def check_high_offset_selects(_, queries):
    """
    :type _ indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        # ignore insert queries (#140)
        if not is_select_query(query):
            continue

        res = get_query_limit_and_offset(query)

        if res is None:
            continue

        (limit, offset) = res

        if offset < OFFSET_THRESHOLD:
            continue

        table_name = get_query_tables(query)[0]

        context = OrderedDict()
        context['query'] = query
        context['limit'] = limit
        context['offset'] = offset

        yield LinterEntry(linter_type='high_offset_selects', table_name=table_name,
                          message='"{}" query uses too high offset impacting the performance'.
                          format(shorten_query(query)),
                          context=context)
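# Illustrative usage sketch (not from the original module). It assumes the made-up
# query's OFFSET exceeds OFFSET_THRESHOLD (value not shown in these excerpts) and that
# LinterEntry keeps its constructor arguments as attributes of the same name. The
# Database argument is ignored by this check, so None is enough.
def _example_high_offset_check():
    sample_queries = [
        "SELECT id, title FROM products LIMIT 50 OFFSET 100000",
    ]

    for entry in check_high_offset_selects(None, sample_queries):
        # each yielded LinterEntry describes one offending query
        print(entry.table_name, entry.context['offset'])  # -> products 100000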
def check_not_used_columns(database, queries):
    """
    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    :raises Exception
    """
    logger = logging.getLogger(__name__)

    # analyze only SELECT queries from the log
    queries = list(filter(is_select_query, queries))

    used_tables = get_used_tables_from_queries(queries)
    used_columns = defaultdict(list)

    logger.info("Will check these tables: %s", used_tables)

    # analyze given queries and collect used columns for each table
    for query in queries:
        tables = get_query_tables(query)
        if tables:
            columns = get_query_columns(query)

            # add used columns per table
            # FIXME: assume we're querying just a single table for now
            used_columns[tables[0]] += columns
        else:
            logger.error(
                'Unable to extract tables and columns used from the query: %s', query)

    # analyze table schemas and report not used columns for each table
    for table in used_tables:
        logger.info("Checking %s table", table)

        table_columns = database.get_table_columns(table)

        # now get the difference and report them
        not_used_columns = [
            column for column in table_columns
            if column.name not in set(used_columns[table])
        ] if table_columns else []

        for column in not_used_columns:
            yield LinterEntry(linter_type='not_used_columns', table_name=table,
                              message='"{}" column was not used by provided queries'.format(column),
                              context={'column_name': column.name, 'column_type': column.type})
def get_used_tables_from_queries(queries):
    """
    :type queries list[str]
    :rtype: list[str]
    """
    used_tables = []

    queries = filter(is_select_query, queries)

    for query in queries:
        # parse each query from the log
        tables = get_query_tables(query)
        if tables and tables[0] not in used_tables:
            used_tables.append(tables[0])

    return used_tables
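# Illustrative usage sketch for check_not_used_columns (not from the original module).
# The stub below stands in for indexdigest.database.Database and returns made-up
# columns; the check only needs them to expose `name` and `type`. As above, reading
# `context` off the yielded LinterEntry assumes constructor arguments are kept as
# attributes.
from collections import namedtuple

_Column = namedtuple('Column', ['name', 'type'])


class _DatabaseStub:
    @staticmethod
    def get_table_columns(_table):
        # pretend the queried table has three columns
        return [_Column('id', 'int'), _Column('name', 'varchar'),
                _Column('last_login', 'datetime')]


def _example_not_used_columns_check():
    sample_queries = ["SELECT id, name FROM users"]

    # get_used_tables_from_queries(sample_queries) would return ['users'] here
    for entry in check_not_used_columns(_DatabaseStub(), sample_queries):
        print(entry.context['column_name'])  # -> last_login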
def check_having_clause(_, queries):
    """
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    queries_with_having_clause = [
        query for query in queries if query_has_having_clause(query)
    ]

    for query in queries_with_having_clause:
        table_name = get_query_tables(query)[0]

        yield LinterEntry(linter_type='having_clause', table_name=table_name,
                          message='"{}" query uses HAVING clause'.format(shorten_query(query)),
                          context={"query": query})
def check_select_star(_, queries):
    """
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    queries_with_wildcard = [
        query for query in queries if is_wildcard_query(query)
    ]

    for query in queries_with_wildcard:
        table_name = get_query_tables(query)[0]

        yield LinterEntry(linter_type='select_star', table_name=table_name,
                          message='"{}" query uses SELECT *'.format(shorten_query(query)),
                          context={"query": query})
def check_insert_ignore_queries(database, queries):
    """
    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    queries = [query for query in queries if is_insert_ignore_query(query)]

    for query in queries:
        table_used = get_query_tables(query)[0]

        context = OrderedDict()
        context['query'] = query
        context['schema'] = database.get_table_schema(table_used)

        yield LinterEntry(linter_type='insert_ignore', table_name=table_used,
                          message='"{}" query uses a risky INSERT IGNORE'.
                          format(shorten_query(query)),
                          context=context)
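# Illustrative usage sketch for check_insert_ignore_queries (not from the original
# module). The stub stands in for indexdigest.database.Database; this check only calls
# get_table_schema() and stores whatever it returns in the report context. Note that
# these excerpts also rely on `import logging` and
# `from collections import OrderedDict, defaultdict`, plus project helpers such as
# LinterEntry, shorten_query and the query-parsing functions, whose import paths are
# not shown here.
class _SchemaStub:
    @staticmethod
    def get_table_schema(_table):
        # made-up DDL, just to show what ends up in the report context
        return "CREATE TABLE `events` (`id` int NOT NULL, PRIMARY KEY (`id`))"


def _example_insert_ignore_check():
    sample_queries = ["INSERT IGNORE INTO events VALUES (1)"]

    for entry in check_insert_ignore_queries(_SchemaStub(), sample_queries):
        print(entry.table_name)  # -> events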
def test_get_query_tables(self):
    self.assertListEqual(['test_table'],
                         get_query_tables('SELECT * FROM `test_table`'))

    self.assertListEqual(['0001_test_table'],
                         get_query_tables('SELECT * FROM `0001_test_table`'))

    self.assertListEqual(['test_table'],
                         get_query_tables('SELECT foo FROM `test_table`'))

    self.assertListEqual(['test_table'],
                         get_query_tables('SELECT foo FROM test_table WHERE id = 1'))

    self.assertListEqual(['test_table', 'second_table'],
                         get_query_tables('SELECT foo FROM test_table, second_table WHERE id = 1'))

    self.assertListEqual(
        ['revision', 'page', 'wikicities_user'],
        get_query_tables(
            'SELECT rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N'
        ))

    self.assertListEqual(
        ['events'],
        get_query_tables(
            "SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' AND wiki_id = '1289985' GROUP BY date WITH ROLLUP"
        ))

    # complex queries
    # @see https://github.com/macbre/query-digest/issues/16
    self.assertListEqual(
        ['report_wiki_recent_pageviews', 'dimension_wikis'],
        get_query_tables(
            "SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N"
        ))

    self.assertListEqual(
        ['dimension_wikis', 'fact_wam_scores'],
        get_query_tables(
            "SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC"
        ))

    self.assertListEqual(
        ['fact_wam_scores', 'dimension_wikis'],
        get_query_tables(
            "SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"
        ))

    # INSERT queries
    self.assertListEqual(
        ['0070_insert_ignore_table'],
        get_query_tables(
            "INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');"
        ))

    self.assertListEqual(
        ['0070_insert_ignore_table'],
        get_query_tables(
            "INSERT into `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');"
        ))
def test_get_query_tables():
    assert ["test_table"] == get_query_tables("SELECT * FROM `test_table`")

    assert ["test_table", "second_table"] == get_query_tables(
        "SELECT foo FROM test_table, second_table WHERE id = 1"
    )