def _handle_column_save(self, token: SQLToken, columns: List[str]):
    """
    Resolve a column token and record it in the parsed-columns output.

    Columns coming from already-resolved WITH queries are stored as
    aliases instead of real columns.
    """
    column = token.table_prefixed_column(self.tables_aliases)
    if self._is_with_query_already_resolved(column):
        # the column originates from a WITH query that was resolved before,
        # so treat it as an alias rather than a concrete column
        self._add_to_columns_aliases_subsection(token=token, left_expand=False)
        token.token_type = TokenType.COLUMN_ALIAS
        return
    resolved = self._resolve_sub_queries(column)
    self._add_to_columns_with_tables(token, resolved)
    self._add_to_columns_subsection(
        keyword=token.last_keyword_normalized, column=resolved
    )
    token.token_type = TokenType.COLUMN
    columns.extend(resolved)
def tokens(self) -> List[SQLToken]:
    """
    Tokenizes the query

    Returns the cached list when the query was already tokenized.
    Each non-whitespace sqlparse token is wrapped in an SQLToken carrying
    its index, current subquery level and the last relevant keyword seen.
    """
    if self._tokens is not None:
        # already tokenized - return the cached result
        return self._tokens

    parsed = sqlparse.parse(self._query)
    tokens = []
    # handle empty queries (#12)
    if not parsed:
        return tokens
    self.sqlparse_tokens = parsed[0].tokens
    sqlparse_tokens = self._flatten_sqlparse()
    # drop whitespace tokens, including ttypes nested under Whitespace
    non_empty_tokens = [
        token
        for token in sqlparse_tokens
        if token.ttype is not Whitespace and token.ttype.parent is not Whitespace
    ]
    last_keyword = None
    for index, tok in enumerate(non_empty_tokens):
        token = SQLToken(
            tok=tok,
            index=index,
            subquery_level=self._subquery_level,
            last_keyword=last_keyword,
        )
        if index > 0:
            # create links between consecutive tokens
            token.previous_token = tokens[index - 1]
            tokens[index - 1].next_token = token
        # parentheses update subquery / nesting bookkeeping on self
        if token.is_left_parenthesis:
            self._determine_opening_parenthesis_type(token=token)
        elif token.is_right_parenthesis:
            self._determine_closing_parenthesis_type(token=token)
        # compare with inner whitespace stripped, e.g. "ORDER  BY" -> "ORDERBY"
        if tok.is_keyword and "".join(tok.normalized.split()) in RELEVANT_KEYWORDS:
            last_keyword = tok.normalized
        token.is_in_nested_function = self._is_in_nested_function
        token.parenthesis_level = self._parenthesis_level
        tokens.append(token)
    self._tokens = tokens
    # since tokens are used in all methods required parsing (so w/o generalization)
    # we set the query type here (and not in init) to allow for generalization
    # but disallow any other usage for not supported queries to avoid unexpected
    # results which are not really an error
    _ = self.query_type
    return tokens
def _determine_opening_parenthesis_type(self, token: SQLToken):
    """
    Determines the type of left parenthesis in query

    Classification is based on the token(s) immediately preceding the
    parenthesis; exactly one of the flags on the token is set.
    """
    preceding = token.previous_token.normalized
    if preceding in SUBQUERY_PRECEDING_KEYWORDS:
        # inside subquery / derived table
        token.is_subquery_start = True
        self._subquery_level += 1
        token.subquery_level = self._subquery_level
    elif preceding in KEYWORDS_BEFORE_COLUMNS.union({","}):
        # we are in columns and in a column subquery definition
        token.is_column_definition_start = True
    elif preceding == "AS":
        # start of the body of a WITH query
        token.is_with_query_start = True
    elif (
        token.get_nth_previous(2).normalized == "TABLE"
        and token.get_nth_previous(3).normalized == "CREATE"
    ):
        # column declarations of a CREATE TABLE statement
        token.is_create_table_columns_declaration_start = True
    else:
        # nested function
        token.is_nested_function_start = True
        self._nested_level += 1
        self._is_in_nested_function = True
    self._open_parentheses.append(token)
    self._parenthesis_level += 1
def _determine_opening_parenthesis_type(self, token: SQLToken):
    """
    Determines the type of left parenthesis in query

    Exactly one start-flag is set on the token, chosen from the context
    immediately preceding the parenthesis.
    """
    prev_token = token.previous_token
    if prev_token.normalized in SUBQUERY_PRECEDING_KEYWORDS:
        # inside subquery / derived table
        token.is_subquery_start = True
        self._subquery_level += 1
        token.subquery_level = self._subquery_level
    elif prev_token.normalized in KEYWORDS_BEFORE_COLUMNS.union({","}):
        # we are in columns and in a column subquery definition
        token.is_column_definition_start = True
    elif prev_token.is_as_keyword and token.last_keyword_normalized != "WINDOW":
        # window clause also contains AS keyword, but it is not a query
        token.is_with_query_start = True
    elif (
        token.last_keyword_normalized == "TABLE"
        and token.find_nearest_token("(") is EmptyToken
    ):
        # first parenthesis after TABLE -> CREATE TABLE column declarations
        token.is_create_table_columns_declaration_start = True
    elif prev_token.normalized == "OVER":
        # window specification, e.g. OVER (PARTITION BY ...)
        token.is_partition_clause_start = True
    else:
        # nested function
        token.is_nested_function_start = True
        self._nested_level += 1
        self._is_in_nested_function = True
    self._open_parentheses.append(token)
    self._parenthesis_level += 1
def _handle_with_name_save(token: SQLToken, with_names: List[str]) -> None:
    """
    Store the name of a WITH (CTE) query.

    Handles both the plain form ``WITH name AS (...)`` and the form with a
    column list ``WITH name (col1, col2) AS (...)``, where the name sits
    just before the column-list parenthesis.
    """
    if not token.is_right_parenthesis:
        # plain CTE name token
        token.token_type = TokenType.WITH_NAME
        with_names.append(token.value)
        return
    # inside columns of with statement
    # like: with (col1, col2) as (subquery)
    token.is_with_columns_end = True
    token.is_nested_function_end = False
    opening = token.find_nearest_token("(")
    opening.is_with_columns_start = True
    opening.is_nested_function_start = False
    name_token = opening.previous_token
    name_token.token_type = TokenType.WITH_NAME
    with_names.append(name_token.value)
def _handle_column_alias_subquery_level_update(self, token: SQLToken) -> None:
    """
    Mark the token as a column alias and remember the deepest subquery
    level at which this alias value has been seen so far.
    """
    token.token_type = TokenType.COLUMN_ALIAS
    self._add_to_columns_aliases_subsection(token=token)
    levels = self._column_aliases_max_subquery_level
    # keep the maximum of the previously recorded level (default 0)
    # and the level of the current occurrence
    levels[token.value] = max(
        levels.setdefault(token.value, 0), token.subquery_level
    )
def _determine_closing_parenthesis_type(self, token: SQLToken):
    """
    Determines the type of right parenthesis in query

    Mirrors the classification of the matching opening parenthesis,
    which is popped from the stack of currently open ones.
    """
    opening = self._open_parentheses.pop(-1)
    if opening.is_subquery_start:
        token.is_subquery_end = True
        self._subquery_level -= 1
    elif opening.is_column_definition_start:
        token.is_column_definition_end = True
    elif opening.is_with_query_start:
        token.is_with_query_end = True
    else:
        # closing a nested function call
        token.is_nested_function_end = True
        self._nested_level -= 1
        if self._nested_level == 0:
            self._is_in_nested_function = False
def _combine_qualified_names(self, index: int, token: SQLToken) -> None:
    """
    Combines names like <schema>.<table>.<column> or <table/sub_query>.<column>
    """
    combined = token.value
    keep_combining = True
    while keep_combining:
        # each pass glues one more "<name>." prefix onto the value;
        # _combine_tokens reports whether another prefix remains
        combined, keep_combining = self._combine_tokens(index=index, value=combined)
        index -= 2
    token.value = combined
def tokens(self) -> List[SQLToken]:
    """
    Tokenizes the query

    Returns the cached list when the query was already tokenized.
    Flattens the sqlparse token tree, drops whitespace tokens and wraps
    each remaining one in an SQLToken with index, subquery level and the
    last non-ignored keyword seen before it.
    """
    if self._tokens is not None:
        # already tokenized - return the cached result
        return self._tokens

    parsed = sqlparse.parse(self.query)
    tokens = []
    # handle empty queries (#12)
    if not parsed:
        return tokens
    sqlparse_tokens = TokenList(parsed[0].tokens).flatten()
    non_empty_tokens = [
        token for token in sqlparse_tokens if token.ttype is not Whitespace
    ]
    last_keyword = None
    for index, tok in enumerate(non_empty_tokens):
        token = SQLToken(
            tok=tok,
            index=index,
            subquery_level=self._subquery_level,
            last_keyword=last_keyword,
        )
        if index > 0:
            # create links between consecutive tokens
            token.previous_token = tokens[index - 1]
            tokens[index - 1].next_token = token
        # parentheses update subquery / nesting bookkeeping on self
        if token.is_left_parenthesis:
            self._determine_opening_parenthesis_type(token=token)
        elif token.is_right_parenthesis:
            self._determine_closing_parenthesis_type(token=token)
        if tok.is_keyword and tok.normalized not in KEYWORDS_IGNORED:
            last_keyword = tok.normalized
        token.is_in_nested_function = self._is_in_nested_function
        tokens.append(token)
    self._tokens = tokens
    return tokens
def _determine_opening_parenthesis_type(self, token: SQLToken):
    """
    Determines the type of left parenthesis in query

    Sets exactly one start-flag on the token based on the token that
    directly precedes the parenthesis.
    """
    preceding = token.previous_token.normalized
    if preceding in SUBQUERY_PRECEDING_KEYWORDS:
        # inside subquery / derived table
        token.is_subquery_start = True
        self._subquery_level += 1
        token.subquery_level = self._subquery_level
    elif preceding in KEYWORDS_BEFORE_COLUMNS + [","]:
        # we are in columns and in a column subquery definition
        token.is_column_definition_start = True
    elif preceding == "AS":
        # start of the body of a WITH query
        token.is_with_query_start = True
    else:
        # nested function
        token.is_nested_function_start = True
        self._nested_level += 1
        self._is_in_nested_function = True
    self._open_parentheses.append(token)
def tokens(self) -> List[SQLToken]:
    """
    Tokenizes the query

    Returns the cached list when the query was already tokenized.
    Dot-separated identifiers (e.g. schema.table.column) spread over
    several sqlparse tokens are merged back into a single SQLToken.
    """
    if self._tokens is not None:
        # already tokenized - return the cached result
        return self._tokens

    parsed = sqlparse.parse(self._query)
    tokens = []
    # handle empty queries (#12)
    if not parsed:
        return tokens
    self._get_sqlparse_tokens(parsed)
    last_keyword = None
    combine_flag = False
    for index, tok in enumerate(self.non_empty_tokens):
        # combine dot separated identifiers
        if self._is_token_part_of_complex_identifier(token=tok, index=index):
            # skip this token - it will be merged into the token that
            # ends the qualified name (handled via combine_flag below)
            combine_flag = True
            continue
        token = SQLToken(
            tok=tok,
            index=index,
            subquery_level=self._subquery_level,
            last_keyword=last_keyword,
        )
        if combine_flag:
            # previous token(s) were parts of a qualified name - glue them
            # onto this token's value
            self._combine_qualified_names(index=index, token=token)
            combine_flag = False
        # link consecutive tokens; the very first token gets EmptyToken
        previous_token = tokens[-1] if index > 0 else EmptyToken
        token.previous_token = previous_token
        previous_token.next_token = token if index > 0 else None
        if token.is_left_parenthesis:
            token.token_type = TokenType.PARENTHESIS
            self._determine_opening_parenthesis_type(token=token)
        elif token.is_right_parenthesis:
            token.token_type = TokenType.PARENTHESIS
            self._determine_closing_parenthesis_type(token=token)
        last_keyword = self._determine_last_relevant_keyword(
            token=token, last_keyword=last_keyword
        )
        token.is_in_nested_function = self._is_in_nested_function
        token.parenthesis_level = self._parenthesis_level
        tokens.append(token)
    self._tokens = tokens
    # since tokens are used in all methods required parsing (so w/o generalization)
    # we set the query type here (and not in init) to allow for generalization
    # but disallow any other usage for not supported queries to avoid unexpected
    # results which are not really an error
    _ = self.query_type
    return tokens
def _add_to_columns_aliases_subsection(
    self, token: SQLToken, left_expand: bool = True
):
    """
    Add alias to the section in which it appears in query

    :param token: the alias token
    :param left_expand: when True (default, preserves previous behavior)
        store the fully expanded alias value (``token.left_expanded``),
        otherwise the raw ``token.value``. Callers such as
        ``_handle_column_save`` pass ``left_expand=False``, which the old
        signature did not accept and raised a TypeError.
    """
    keyword = token.last_keyword_normalized
    alias = token.left_expanded if left_expand else token.value
    if (
        token.last_keyword_normalized in ["FROM", "WITH"]
        and token.find_nearest_token("(").is_with_columns_start
    ):
        # aliases declared in a WITH-clause column list belong to SELECT
        keyword = "SELECT"
    section = COLUMNS_SECTIONS[keyword]
    self._columns_aliases_dict = self._columns_aliases_dict or dict()
    self._columns_aliases_dict.setdefault(section, UniqueList()).append(alias)
def _resolve_function_alias(self, token: SQLToken) -> Union[str, List[str]]:
    """
    Resolve the columns covered by an alias of a function or expression.

    The alias may cover one function, a chain of functions
    (e.g. ``sum(a) + sum(b) as alias``) or an operation on columns
    (e.g. ``col1 + col2 as alias``).
    """
    boundary = token.find_nearest_token(
        [",", "SELECT"], value_attribute="normalized"
    )
    # commas inside nested function arguments do not delimit the
    # aliased expression - keep searching further left
    while boundary.is_in_nested_function:
        boundary = boundary.find_nearest_token(
            [",", "SELECT"], value_attribute="normalized"
        )
    return self._find_all_columns_between_tokens(
        start_token=boundary, end_token=token
    )
def _determine_last_relevant_keyword(self, token: SQLToken, last_keyword: str):
    """
    Return the keyword to treat as the current clause context.

    Returns ``last_keyword`` unchanged unless the token is a relevant
    keyword that is not one of the special exceptions below.
    """
    if not (
        token.is_keyword
        and "".join(token.normalized.split()) in RELEVANT_KEYWORDS
    ):
        return last_keyword
    # FROM used inside EXTRACT(... FROM ...) - not a real FROM clause
    extract_from = (
        token.normalized == "FROM"
        and token.get_nth_previous(3).normalized == "EXTRACT"
    )
    # ORDER BY inside an OVER (PARTITION BY ... ORDER BY ...) clause
    partition_order_by = (
        token.normalized == "ORDERBY"
        and len(self._open_parentheses) > 0
        and self._open_parentheses[-1].is_partition_clause_start
    )
    # USING appearing while still in a SELECT context
    select_using = token.normalized == "USING" and last_keyword == "SELECT"
    if extract_from or partition_order_by or select_using:
        return last_keyword
    return token.normalized
def _find_column_for_with_column_alias(self, token: SQLToken) -> str:
    """
    Map an alias from a WITH-clause column list to the column it names.

    Candidate columns of the WITH query body are computed once per query
    start token and then consumed in declaration order.
    """
    query_start = token.find_nearest_token(
        True, direction="right", value_attribute="is_with_query_start"
    )
    if query_start not in self._with_columns_candidates:
        query_end = query_start.find_nearest_token(
            True, direction="right", value_attribute="is_with_query_end"
        )
        self._with_columns_candidates[query_start] = (
            self._find_all_columns_between_tokens(
                start_token=query_start, end_token=query_end
            )
        )
    candidates = self._with_columns_candidates[query_start]
    if isinstance(candidates, list):
        # consume one candidate per alias, in order of declaration
        return candidates.pop(0)
    return candidates
def _resolve_subquery_alias(self, token: SQLToken) -> Union[str, List[str]]:
    """
    Resolve an alias given to a parenthesized expression in the column list.

    Handles nested subqueries like
    ``select a, (select a as b from x) as column`` as well as chains of
    functions or redundant parentheses.
    """
    definition_start = token.find_nearest_token(
        True, value_attribute="is_column_definition_start"
    )
    if definition_start.next_token.normalized != "SELECT":
        # chain of functions or redundant parenthesis
        return self._find_all_columns_between_tokens(
            start_token=definition_start, end_token=token
        )
    # we have a subquery - find the alias it exposes and resolve it
    alias_token = definition_start.next_token.find_nearest_token(
        self._aliases_to_check,
        direction="right",
        value_attribute="value",
    )
    return self._resolve_alias_to_column(alias_token)