예제 #1
0
def test_redshift():
    assert get_query_tables(
        "ALTER TABLE target_table APPEND FROM source_table") == [
            "target_table",
            "source_table",
        ]
    assert get_query_tables("ALTER TABLE x APPEND FROM y") == ["x", "y"]
예제 #2
0
def test_insert_overwrite_table():
    assert ["foo_report"] == get_query_tables("INSERT TABLE foo_report")
    assert ["foo_report"
            ] == get_query_tables("INSERT OVERWRITE TABLE foo_report")
    assert ["foo_report", "bar"] == get_query_tables(
        "INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar")

    assert ["foo"] == get_query_columns(
        "INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar")
예제 #3
0
def test_select_aliases():
    assert get_query_tables('SELECT e.foo FROM bar AS e') == ['bar']
    assert get_query_tables('SELECT e.foo FROM bar e') == ['bar']
    assert get_query_tables('SELECT e.foo FROM (SELECT * FROM bar) AS e') == [
        'bar'
    ]
    assert get_query_tables('SELECT e.foo FROM (SELECT * FROM bar) e') == [
        'bar'
    ]
예제 #4
0
def test_insert_overwrite_table():
    assert ['foo_report'] == get_query_tables('INSERT TABLE foo_report')
    assert ['foo_report'
            ] == get_query_tables('INSERT OVERWRITE TABLE foo_report')
    assert ['foo_report', 'bar'] == get_query_tables(
        'INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar')

    assert ['foo'] == get_query_columns(
        'INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar')
예제 #5
0
def test_select_aliases():
    assert get_query_tables("SELECT e.foo FROM bar AS e") == ["bar"]
    assert get_query_tables("SELECT e.foo FROM bar e") == ["bar"]
    assert get_query_tables("SELECT e.foo FROM (SELECT * FROM bar) AS e") == [
        "bar"
    ]
    assert get_query_tables("SELECT e.foo FROM (SELECT * FROM bar) e") == [
        "bar"
    ]
예제 #6
0
def test_unions():
    # @see https://github.com/macbre/sql-metadata/issues/79
    assert ["tab1", "tab2"] == get_query_tables(
        "select col1, col2, col3 from tab1 union all select col4, col5, col6 from tab2"
    )

    # @see https://github.com/macbre/sql-metadata/issues/94
    assert [
        "d", "g"
    ] == get_query_tables("SELECT a,b,c FROM d UNION ALL SELECT e,f FROM g")
예제 #7
0
def test_joins():
    assert ['redirect', 'page'] == \
        get_query_tables("SELECT  page_title  FROM `redirect` INNER JOIN `page` "
                         "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))")

    assert ['redirect', 'page'] == \
        get_query_tables("SELECT  page_title  FROM `redirect` INNER JOIN `page` `foo` "
                         "ON (rd_title = 'foo' AND rd_namespace = '100' AND (foo.page_id = rd_from))")

    assert ['page_title', 'rd_title', 'rd_namespace', 'page_id', 'rd_from'] == \
        get_query_columns("SELECT  page_title  FROM `redirect` INNER JOIN `page` "
                          "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))")
예제 #8
0
def test_insert_into_select():
    # https://dev.mysql.com/doc/refman/5.7/en/insert-select.html
    query = "INSERT INTO foo SELECT * FROM bar"
    assert get_query_tables(query) == ["foo", "bar"]
    assert get_query_columns(query) == ["*"]

    query = "INSERT INTO foo SELECT id, price FROM bar"
    assert get_query_tables(query) == ["foo", "bar"]
    assert get_query_columns(query) == ["id", "price"]

    query = "INSERT INTO foo SELECT id, price FROM bar WHERE qty > 200"
    assert get_query_tables(query) == ["foo", "bar"]
    assert get_query_columns(query) == ["id", "price", "qty"]
예제 #9
0
def test_datasets():
    # see https://github.com/macbre/sql-metadata/issues/38
    assert get_query_tables(
        "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM TABLE1 AS A, TABLE2 AS B"
    ) == ["TABLE1", "TABLE2"]

    assert get_query_tables(
        "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM DATASET1.TABLE1, DATASET2.TABLE2"
    ) == ["DATASET1.TABLE1", "DATASET2.TABLE2"]

    assert get_query_tables(
        "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM DATASET1.TABLE1 AS A, DATASET2.TABLE2 AS B"
    ) == ["DATASET1.TABLE1", "DATASET2.TABLE2"]
예제 #10
0
def test_insert_into_select():
    # https://dev.mysql.com/doc/refman/5.7/en/insert-select.html
    query = "INSERT INTO foo SELECT * FROM bar"
    assert get_query_tables(query) == ['foo', 'bar']
    assert get_query_columns(query) == ['*']

    query = "INSERT INTO foo SELECT id, price FROM bar"
    assert get_query_tables(query) == ['foo', 'bar']
    assert get_query_columns(query) == ['id', 'price']

    query = "INSERT INTO foo SELECT id, price FROM bar WHERE qty > 200"
    assert get_query_tables(query) == ['foo', 'bar']
    assert get_query_columns(query) == ['id', 'price', 'qty']
예제 #11
0
def test_table_name_with_group_by():
    expected_tables = ['SH.sales']

    assert get_query_tables(
        "SELECT s.cust_id,count(s.cust_id) FROM SH.sales s") == expected_tables

    assert get_query_tables(
        "SELECT s.cust_id,count(s.cust_id) FROM SH.sales s GROUP BY s.cust_id"
    ) == expected_tables

    assert get_query_tables("""
SELECT s.cust_id,count(s.cust_id) FROM SH.sales s
GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2'
    """.strip()) == expected_tables
예제 #12
0
def parse_sql(params):
    '''
        parse sql
    '''
    files = get_files_in_path(params)
    result = {}
    for filepath, filename in files:
        with open(filepath, 'r') as f:
            sql_data = f.read()

        get_tables = sql_metadata.get_query_tables(sql_data)
        source_table = filename.replace('.sql', '')

        local_depends = {}

        for table in get_tables:
            if '.' in table:
                table_name = table.split('.')[1]
            else:
                table_name = table

            local_depends[table_name] = None

        result[source_table] = local_depends

    #print(result)
    return result
예제 #13
0
def test_complex_hive_query():
    # https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-InsertingdataintoHiveTablesfromqueries
    dag = """
INSERT OVERWRITE TABLE foo_report
SELECT
  d.domain, r.wiki_id, r.beacon, r.pageviews
FROM
  (SELECT wiki_id, beacon, sum(pageviews) AS pageviews FROM rollup_wiki_beacon_pageviews
   WHERE period_id = '1' AND
     (
       year > '{{ beginYear28 }}' OR
       (year = '{{ beginYear28 }}' AND month > '{{ beginMonth28 }}') OR
       (year = '{{ beginYear28 }}' AND month = '{{ beginMonth28 }}' AND day > '{{ beginDay28 }}')
     ) AND (
       year < '{{ beginYear }}' OR
       (year = '{{ beginYear }}' AND month < '{{ beginMonth }}') OR
       (year = '{{ beginYear }}' AND month = '{{ beginMonth }}' AND day <= '{{ beginDay }}')
     )
   GROUP BY wiki_id, beacon) r
JOIN statsdb.dimension_wikis d ON r.wiki_id = d.wiki_id;
    """

    assert [
        'foo_report', 'rollup_wiki_beacon_pageviews', 'statsdb.dimension_wikis'
    ] == get_query_tables(dag)
예제 #14
0
def test_with_brackets():
    assert ["database1.table1", "database2.table2"] == get_query_tables("""
        SELECT
          "xxxxx"
        FROM
          (database1.table1 alias
        LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx"))
        """)

    assert ["inner_table"] == get_query_tables("""
        SELECT
            t.foo
        FROM
            (SELECT foo FROM inner_table
            WHERE bar = '1') t
        """)
def check_high_offset_selects(_, queries):
    """
    :type _ indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        # ignore insert queries (#140)
        if not is_select_query(query):
            continue

        res = get_query_limit_and_offset(query)

        if res is None:
            continue

        (limit, offset) = res

        if offset < OFFSET_THRESHOLD:
            continue

        table_name = get_query_tables(query)[0]

        context = OrderedDict()
        context['query'] = query
        context['limit'] = limit
        context['offset'] = offset

        yield LinterEntry(linter_type='high_offset_selects', table_name=table_name,
                          message='"{}" query uses too high offset impacting the performance'.
                          format(shorten_query(query)),
                          context=context)
예제 #16
0
def get_portability_metrics_query(logline, job_name):
    """
    :type logline str
    :type job_name str
    :rtype: list[str, str, str]
    """
    # extract SQL query
    try:
        sql = str(re.search(r'INFO (.*)$', logline).group(1).strip())
    except AttributeError:
        return

    # remove "SQL: " prefix
    sql = sql.replace('SQL:', '').lstrip()

    # script name
    # portability-metric-metric-article-provider-py-1548232800
    script_name = str(
        re.match(r'portability-metric-([a-z-]+)-\d+', job_name).group(1))
    script_name = script_name.replace('-py', '.py')

    # SELECT, UPDATE, ...
    query_type = sql.split(' ')[0].upper()

    # get query metadata
    for table in get_query_tables(sql):
        # ignore one letter table aliases
        if len(table) == 1:
            continue

        yield script_name, query_type, table
예제 #17
0
def test_case_syntax():
    # https://dev.mysql.com/doc/refman/8.0/en/case.html
    assert get_query_columns(
        'select case when p > 0 then 1 else 0 end as cs from c where g > f'
    ) == ['p', 'g', 'f']
    assert get_query_tables(
        'select case when p > 0 then 1 else 0 end as cs from c where g > f'
    ) == ['c']
예제 #18
0
def test_case_syntax():
    # https://dev.mysql.com/doc/refman/8.0/en/case.html
    assert get_query_columns(
        "select case when p > 0 then 1 else 0 end as cs from c where g > f"
    ) == ["p", "g", "f"]
    assert get_query_tables(
        "select case when p > 0 then 1 else 0 end as cs from c where g > f"
    ) == ["c"]
예제 #19
0
def test_joins():
    assert ['redirect', 'page'] == \
        get_query_tables("SELECT  page_title  FROM `redirect` INNER JOIN `page` "
                         "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))")

    assert ['redirect', 'page'] == \
        get_query_tables("SELECT  page_title  FROM `redirect` INNER JOIN `page` `foo` "
                         "ON (rd_title = 'foo' AND rd_namespace = '100' AND (foo.page_id = rd_from))")

    assert ['page_title', 'rd_title', 'rd_namespace', 'page_id', 'rd_from'] == \
        get_query_columns("SELECT  page_title  FROM `redirect` INNER JOIN `page` "
                          "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))")

    # see #34
    assert ['foos', 'bars'] == \
        get_query_tables("SELECT foo FROM `foos` JOIN `bars` ON (foos.id = bars.id)")

    assert ['foos', 'bars'] == \
        get_query_tables("SELECT foo FROM `foos` FULL JOIN `bars` ON (foos.id = bars.id)")

    assert ['foos', 'bars'] == \
        get_query_tables("SELECT foo FROM `foos` FULL OUTER JOIN `bars` ON (foos.id = bars.id)")

    assert ['foos', 'bars'] == \
        get_query_tables("SELECT foo FROM `foos` RIGHT OUTER JOIN `bars` ON (foos.id = bars.id)")

    assert ['foos', 'bars'] == \
        get_query_tables("SELECT foo FROM `foos` LEFT OUTER JOIN `bars` ON (foos.id = bars.id)")
예제 #20
0
def test_case_insensitive():
    # case-insensitive handling
    # https://github.com/macbre/sql-metadata/issues/71
    assert ["abc.foo", "foo", "bar"] == get_query_tables(
        "create table abc.foo as SELECT pqr.foo1 , ab.foo2 FROM foo pqr, bar ab"
    )

    assert ["abc.foo", "foo", "bar"] == get_query_tables(
        "create table abc.foo as select pqr.foo1 , ab.foo2 FROM foo pqr, bar ab"
    )

    assert ["pqr.foo1", "ab.foo2"] == get_query_columns(
        "create table abc.foo as SELECT pqr.foo1 , ab.foo2 FROM foo pqr, bar ab"
    )

    assert ["pqr.foo1", "ab.foo2"] == get_query_columns(
        "create table abc.foo as select pqr.foo1 , ab.foo2 FROM foo pqr, bar ab"
    )
예제 #21
0
def test_handle_force_index():
    query = "SELECT  page_title,page_namespace  FROM `page` FORCE INDEX (page_random) " \
            "JOIN `categorylinks` ON ((page_id=cl_from))  WHERE page_is_redirect = '0' " \
            "AND (page_random >= 0.197372293871) AND cl_to = 'Muppet_Characters'  " \
            "ORDER BY page_random LIMIT 1"

    assert get_query_tables(query) == ['page', 'categorylinks']
    assert get_query_columns(query) == \
        ['page_title', 'page_namespace', 'page_id', 'cl_from', 'page_is_redirect', 'page_random', 'cl_to']
예제 #22
0
def test_tables_aliases_are_resolved():
    """
    See https://github.com/macbre/sql-metadata/issues/52
    """
    sql = "SELECT a.* FROM users1 AS a JOIN users2 AS b ON a.ip_address = b.ip_address"

    assert get_query_tables(sql) == ['users1', 'users2']
    assert get_query_table_aliases(sql) == {'a': 'users1', 'b': 'users2'}
    assert get_query_columns(sql) == [
        'users1.*', 'users1.ip_address', 'users2.ip_address'
    ], 'Should resolve table aliases'
def check_not_used_columns(database, queries):
    """
    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    :raises Exception
    """
    logger = logging.getLogger(__name__)

    # analyze only SELECT queries from the log
    queries = list(filter(is_select_query, queries))

    used_tables = get_used_tables_from_queries(queries)
    used_columns = defaultdict(list)

    logger.info("Will check these tables: %s", used_tables)

    # analyze given queries and collect used columns for each table
    for query in queries:
        tables = get_query_tables(query)
        if tables:
            columns = get_query_columns(query)

            # print(query, table, columns)

            # add used columns per table
            # FIXME: assume we're querying just a single table for now
            used_columns[tables[0]] += columns
        else:
            logger.error(
                'Unable to extract tables and columns used from the query: %s',
                query)

    # analyze table schemas and report not used columns for each table
    for table in used_tables:
        logger.info("Checking %s table", table)
        table_columns = database.get_table_columns(table)

        # now get the difference and report them
        not_used_columns = [
            column for column in table_columns
            if column.name not in set(used_columns[table])
        ] if table_columns else []

        for column in not_used_columns:
            yield LinterEntry(
                linter_type='not_used_columns',
                table_name=table,
                message='"{}" column was not used by provided queries'.format(
                    column),
                context={
                    'column_name': column.name,
                    'column_type': column.type
                })
예제 #24
0
def test_with_with():
    pytest.skip("Improve WITH syntax handling with a new parser (#98)")

    assert ["table3", "database2.table2"] == get_query_tables("""
        WITH
            database1.tableFromWith AS SELECT * FROM table3
        SELECT
          "xxxxx"
        FROM
          database1.tableFromWith alias
        LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx")
        """)
예제 #25
0
def test_queries_with_distinct():
    assert get_query_columns("SELECT DISTINCT DATA.ASSAY_ID FROM foo") == [
        "DATA.ASSAY_ID"
    ]

    assert get_query_columns("SELECT UNIQUE DATA.ASSAY_ID FROM foo") == [
        "DATA.ASSAY_ID"
    ]

    assert get_query_tables("SELECT DISTINCT DATA.ASSAY_ID FROM foo") == [
        "foo"
    ]
예제 #26
0
def test_tables_aliases_are_resolved():
    """
    See https://github.com/macbre/sql-metadata/issues/52
    """
    sql = "SELECT a.* FROM users1 AS a JOIN users2 AS b ON a.ip_address = b.ip_address"

    assert get_query_tables(sql) == ["users1", "users2"]
    assert get_query_table_aliases(sql) == {"a": "users1", "b": "users2"}
    assert get_query_columns(sql) == [
        "users1.*",
        "users1.ip_address",
        "users2.ip_address",
    ], "Should resolve table aliases"
def get_columns(sql_query: str) -> list:
    if sql_query is not None:
        tables: list = sql_metadata.get_query_tables(sql_query)
        columns: list = sql_metadata.get_query_columns(sql_query)

        if len(columns) == 1:
            #* Return all columns
            if len(columns) == 1 and columns[0] == '*':
                if tables[0] == 'public.store':
                    columns = TABLE_STORE_COLUMNS
                else:
                    columns = TABLE_USER_COLUMNS
        return columns
예제 #28
0
def test_sql_server_cte():
    """
    Tests support for SQL Server's common table expression (CTE).

    @see https://www.sqlservertutorial.net/sql-server-basics/sql-server-cte/
    """
    assert get_query_tables("""
WITH x AS (
    SELECT * FROM n
)
SELECT
    *
FROM x
JOIN y ON x.a = y.a
    """.strip()) == ['n', 'x', 'y']

    assert get_query_tables("""
WITH x AS (
    SELECT * FROM n
)
select
    *
FROM x
JOIN y ON x.a = y.a
    """.strip()) == ['n', 'x', 'y']

    assert get_query_tables("""
WITH foo AS (
    SELECT * FROM n
)
update z from foo set z.q = fpp.y 
    """.strip()) == ['n', 'z', 'foo']

    assert get_query_tables("""
WITH foo AS ( 
     SELECT * FROM tab
) 
DELETE FROM z JOIN foo ON z.a = foo.a  
    """.strip()) == ['tab', 'z', 'foo']
def get_used_tables_from_queries(queries):
    """
    :type queries list[str]
    :rtype: list[str]
    """
    used_tables = []
    queries = filter(is_select_query, queries)

    for query in queries:
        # parse each query from the log
        tables = get_query_tables(query)
        if tables and tables[0] not in used_tables:
            used_tables.append(tables[0])

    return used_tables
예제 #30
0
    def _get_sql_table_names(cls, sql: str) -> List[str]:
        sql_tables: List[str] = get_query_tables(sql)

        # Remove temporary tables from WITH statements
        sql_table_names = [
            t for t in sql_tables if not re.search(
                fr"WITH(.*,)?\s+{t}(\s*\([\w\s,]+\))?\s+AS\s+\(",
                sql,
                re.IGNORECASE | re.DOTALL,
            )
        ]

        # Remove quotes from tables
        sql_table_names = [t.replace('"', "") for t in sql_table_names]

        return sql_table_names