def test_parse_bugged_cte(): with pytest.raises(RuntimeError): SqlParser.parse( ''' WITH sum_trans ( SELECT user_id, COUNT(*) as cnt, SUM(amount) as balance FROM transactions WHERE created_date > '2020-01-01' GROUP BY user_id ) INSERT INTO potential_fraud (user_id, cnt, balance) SELECT user_id, cnt, balance FROM sum_trans WHERE count > 1000 OR balance > 100000; ''' )
def extract(self) -> StepMetadata: # (1) Parse sql statement to obtain input / output tables. sql_meta: SqlMeta = SqlParser.parse(self.operator.sql, self.default_schema) # (2) Default all inputs / outputs to current connection. # NOTE: We'll want to look into adding support for the `database` # property that is used to override the one defined in the connection. conn_id = self._conn_id() source = Source(type=self.source_type, name=conn_id, connection_url=get_connection_uri(conn_id)) # (3) Map input / output tables to dataset objects with source set # as the current connection. We need to also fetch the schema for the # input tables to format the dataset name as: # {schema_name}.{table_name} inputs = [ Dataset.from_table(source=source, table_name=in_table_schema.table_name.name, schema_name=in_table_schema.schema_name) for in_table_schema in self._get_table_schemas(sql_meta.in_tables) ] outputs = [ Dataset.from_table_schema(source=source, table_schema=out_table_schema) for out_table_schema in self._get_table_schemas(sql_meta.out_tables) ] return StepMetadata( name=f"{self.operator.dag_id}.{self.operator.task_id}", inputs=inputs, outputs=outputs, context={'sql': self.operator.sql})
def test_ignores_default_schema_when_non_default_schema(): sql_meta = SqlParser.parse( ''' SELECT col0, col1, col2 FROM transactions.table0 ''', 'public' ) assert sql_meta.in_tables == [DbTableName('transactions.table0')]
def test_parse_default_schema(): sql_meta = SqlParser.parse( ''' SELECT col0, col1, col2 FROM table0 ''', 'public' ) assert sql_meta.in_tables == [DbTableName('public.table0')]
def test_parse_simple_select_with_table_schema_prefix_and_extra_whitespace(): sql_meta = SqlParser.parse( ''' SELECT * FROM schema0.table0 ; ''' ) assert sql_meta.in_tables == [DbTableName('schema0.table0')] assert sql_meta.out_tables == []
def test_parse_simple_insert_into(): sql_meta = SqlParser.parse( ''' INSERT INTO table0 (col0, col1, col2) VALUES (val0, val1, val2); ''' ) assert sql_meta.in_tables == [] assert sql_meta.out_tables == [DbTableName('table0')]
def test_parse_simple_select(): sql_meta = SqlParser.parse( ''' SELECT * FROM table0; ''' ) log.debug("sqlparser.parse() successful.") assert sql_meta.in_tables == [DbTableName('table0')] assert sql_meta.out_tables == []
def test_parse_simple_insert_into_select(): sql_meta = SqlParser.parse( ''' INSERT INTO table1 (col0, col1, col2) SELECT col0, col1, col2 FROM table0; ''' ) assert sql_meta.in_tables == [DbTableName('table0')] assert sql_meta.out_tables == [DbTableName('table1')]
def test_parse_simple_select_into(): sql_meta = SqlParser.parse( ''' SELECT * INTO table0 FROM table1; ''' ) assert sql_meta.in_tables == [DbTableName('table1')] assert sql_meta.out_tables == [DbTableName('table0')]
def test_parse_simple_right_outer_join(): sql_meta = SqlParser.parse( ''' SELECT col0, col1, col2 FROM table0 RIGHT OUTER JOIN table1 ON t1.col0 = t2.col0; ''' ) assert sql_meta.in_tables == [DbTableName('table0'), DbTableName('table1')] assert sql_meta.out_tables == []
def test_parse_simple_inner_join(): sql_meta = SqlParser.parse( ''' SELECT col0, col1, col2 FROM table0 INNER JOIN table1 ON t1.col0 = t2.col0 ''' ) assert set(sql_meta.in_tables) == {DbTableName('table0'), DbTableName('table1')} assert sql_meta.out_tables == []
def test_parse_simple_left_join(): sql_meta = SqlParser.parse( ''' SELECT col0, col1, col2 FROM table0 LEFT JOIN table1 ON t1.col0 = t2.col0 ''' ) assert sql_meta.in_tables == [DbTableName('table0'), DbTableName('table1')] assert sql_meta.out_tables == []
def test_parser_integration(): sql_meta = SqlParser.parse( """ INSERT INTO popular_orders_day_of_week (order_day_of_week, order_placed_on,orders_placed) SELECT EXTRACT(ISODOW FROM order_placed_on) AS order_day_of_week, order_placed_on, COUNT(*) AS orders_placed FROM top_delivery_times GROUP BY order_placed_on; """, "public" ) assert sql_meta.in_tables == [DbTableName('public.top_delivery_times')]
def test_parse_simple_cte(): sql_meta = SqlParser.parse( ''' WITH sum_trans as ( SELECT user_id, COUNT(*) as cnt, SUM(amount) as balance FROM transactions WHERE created_date > '2020-01-01' GROUP BY user_id ) INSERT INTO potential_fraud (user_id, cnt, balance) SELECT user_id, cnt, balance FROM sum_trans WHERE count > 1000 OR balance > 100000; ''' ) assert sql_meta.in_tables == [DbTableName('transactions')] assert sql_meta.out_tables == [DbTableName('potential_fraud')]
def parse_sql_context(self): context = { 'sql': self.operator.sql, } try: sql_meta = SqlParser.parse(self.operator.sql) log.debug(f"bigquery sql parsed and obtained meta: {sql_meta}") context['bigquery.sql.parsed.inputs'] = json.dumps( [in_table.name for in_table in sql_meta.in_tables]) context['bigquery.sql.parsed.outputs'] = json.dumps( [out_table.name for out_table in sql_meta.out_tables]) except Exception as e: log.error(f"Cannot parse sql query. {e}", exc_info=True) context['bigquery.extractor.sql_parser_error'] = \ f'{e}: {traceback.format_exc()}' self.log.info(context) return context
def parse_sql_context(self) -> SqlContext: try: sql_meta = SqlParser.parse(self.operator.sql, None) log.debug(f"bigquery sql parsed and obtained meta: {sql_meta}") return SqlContext( sql=self.operator.sql, inputs=json.dumps( [in_table.name for in_table in sql_meta.in_tables] ), outputs=json.dumps( [out_table.name for out_table in sql_meta.out_tables] ) ) except Exception as e: log.error(f"Cannot parse sql query. {e}", exc_info=True) return SqlContext( sql=self.operator.sql, parser_error=f'{e}: {traceback.format_exc()}' )
def test_parse_recursive_cte(): sql_meta = SqlParser.parse( ''' WITH RECURSIVE subordinates AS (SELECT employee_id, manager_id, full_name FROM employees WHERE employee_id = 2 UNION SELECT e.employee_id, e.manager_id, e.full_name FROM employees e INNER JOIN subordinates s ON s.employee_id = e.manager_id) INSERT INTO sub_employees (employee_id, manager_id, full_name) SELECT employee_id, manager_id, full_name FROM subordinates; ''' ) assert sql_meta.in_tables == [DbTableName('employees')] assert sql_meta.out_tables == [DbTableName('sub_employees')]
def test_tpcds_cte_query(): sql_meta = SqlParser.parse(""" WITH year_total AS (SELECT c_customer_id customer_id, c_first_name customer_first_name, c_last_name customer_last_name, c_preferred_cust_flag customer_preferred_cust_flag, c_birth_country customer_birth_country, c_login customer_login, c_email_address customer_email_address, d_year dyear, Sum(((ss_ext_list_price - ss_ext_wholesale_cost - ss_ext_discount_amt) + ss_ext_sales_price) / 2) year_total, 's' sale_type FROM src.customer, store_sales, date_dim WHERE c_customer_sk = ss_customer_sk AND ss_sold_date_sk = d_date_sk GROUP BY c_customer_id, c_first_name, c_last_name, c_preferred_cust_flag, c_birth_country, c_login, c_email_address, d_year) SELECT t_s_secyear.customer_id, t_s_secyear.customer_first_name, t_s_secyear.customer_last_name, t_s_secyear.customer_preferred_cust_flag FROM year_total t_s_firstyear, year_total t_s_secyear, year_total t_c_firstyear, year_total t_c_secyear, year_total t_w_firstyear, year_total t_w_secyear WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id AND t_s_firstyear.customer_id = t_c_secyear.customer_id AND t_s_firstyear.customer_id = t_c_firstyear.customer_id AND t_s_firstyear.customer_id = t_w_firstyear.customer_id AND t_s_firstyear.customer_id = t_w_secyear.customer_id AND t_s_firstyear.sale_type = 's' AND t_c_firstyear.sale_type = 'c' AND t_w_firstyear.sale_type = 'w' AND t_s_secyear.sale_type = 's' AND t_c_secyear.sale_type = 'c' AND t_w_secyear.sale_type = 'w' AND t_s_firstyear.dyear = 2001 AND t_s_secyear.dyear = 2001 + 1 AND t_c_firstyear.dyear = 2001 AND t_c_secyear.dyear = 2001 + 1 AND t_w_firstyear.dyear = 2001 AND t_w_secyear.dyear = 2001 + 1 AND t_s_firstyear.year_total > 0 AND t_c_firstyear.year_total > 0 AND t_w_firstyear.year_total > 0 AND CASE WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total ELSE NULL END > CASE WHEN t_s_firstyear.year_total > 0 THEN t_s_secyear.year_total / t_s_firstyear.year_total ELSE NULL END AND CASE WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total ELSE NULL END > CASE WHEN t_w_firstyear.year_total > 0 THEN t_w_secyear.year_total / t_w_firstyear.year_total ELSE NULL END ORDER BY t_s_secyear.customer_id, t_s_secyear.customer_first_name, t_s_secyear.customer_last_name, t_s_secyear.customer_preferred_cust_flag LIMIT 100; """) assert set(sql_meta.in_tables) == { DbTableName("src.customer"), DbTableName("store_sales"), DbTableName("date_dim") } assert len(sql_meta.out_tables) == 0