def select_parsing(query):
    """Parse a ``SELECT`` statement and return the pyparsing results.

    The grammar is ``SELECT`` followed by ``*`` or a column name list,
    ``FROM`` with a table name list, and an optional ``WHERE`` clause.
    Matches are exposed under the result names ``columnsToShow``,
    ``tables`` and ``where``; ``where`` falls back to ``""`` when the
    clause is absent.  Text matching ``oracleSqlComment`` is ignored.
    """
    shown_columns = ("*" | columnNameList)("columnsToShow")
    source_tables = tableNameList("tables")
    where_clause = Optional(Group(WHERE + whereExpression), "")("where")

    statement = Forward()
    statement <<= SELECT + shown_columns + FROM + source_tables + where_clause
    statement.ignore(oracleSqlComment)
    return statement.parseString(query)
def build_parser(root_directory, path, fake_root=os.getcwd(), file_reader=None):
    """Build a pyparsing grammar for ``;``-terminated, brace-nested statements.

    Two statement forms are recognised:

    * ``include "<quoted string>";`` -- handled by
      ``IncludeHandler.pyparsing_call``.
    * ``<identifier> <arguments...> [{ <statements> }];`` -- handled by
      ``IncludeHandler.pyparsing_mark``.

    ``//`` and ``#`` line comments and C-style block comments are ignored.

    All arguments are forwarded to ``IncludeHandler`` together with the
    root grammar element.  Note that the ``fake_root`` default is computed
    once, when this function is defined, not on every call.

    Returns the root ``Forward`` element.  It additionally carries a
    ``parse_file(f)`` attribute that calls ``parseFile(f, parseAll=True)``
    on that root.
    """
    from pyparsing import (
        Forward,
        Group,
        Keyword,
        Literal,
        OneOrMore,
        Optional,
        QuotedString,
        Word,
        ZeroOrMore,
        alphanums,
        cStyleComment,
        nestedExpr,
        restOfLine,
    )

    grammar = Forward()
    handler = IncludeHandler(
        root_directory,
        path,
        grammar,
        fake_root=fake_root,
        file_reader=file_reader,
    )

    # Deliberately relaxed grammar: identifiers accept a wide character set.
    word = Word(alphanums + "-_.:/")

    slash_comment = ("//" + restOfLine).suppress()
    hash_comment = ("#" + restOfLine).suppress()
    any_comment = slash_comment | hash_comment | cStyleComment

    terminator = Literal(";").suppress()
    arg = QuotedString('"') | word
    arg_list = ZeroOrMore(arg)

    body = Forward()
    block = nestedExpr("{", "}", body)

    include_stmt = Keyword("include").suppress() + QuotedString('"')
    regular_stmt = word + Group(arg_list) + Optional(block, default=[])

    # setParseAction() returns the element itself, so attaching the actions
    # first and combining afterwards builds the same alternation.
    include_stmt.setParseAction(handler.pyparsing_call)
    regular_stmt.setParseAction(handler.pyparsing_mark)
    any_stmt = include_stmt | regular_stmt

    body << OneOrMore(any_stmt + terminator)
    grammar << Optional(body)
    grammar.ignore(any_comment)

    def parse_file(f, root=grammar):
        # ``root`` is bound as a default so the helper keeps working even
        # when it is detached from the grammar object.
        return root.parseFile(f, parseAll=True)

    grammar.parse_file = parse_file
    return grammar
def delete_query_parse(query):
    """Parse a ``DELETE FROM`` statement and return the pyparsing results.

    The grammar is ``DELETE FROM <table>`` with an optional ``WHERE``
    clause.  The table is exposed as ``tableName`` and the grouped WHERE
    clause as ``where`` (``""`` when absent).  Text matching
    ``oracleSqlComment`` is ignored.
    """
    target_table = tableName("tableName")
    where_clause = Optional(Group(WHERE + whereExpression), "")("where")

    statement = Forward()
    statement <<= DELETE + FROM + target_table + where_clause
    statement.ignore(oracleSqlComment)
    return statement.parseString(query)
def create_db_parse(query):
    """Parse a ``CREATE DATABASE`` statement and return the pyparsing results.

    The grammar is ``CREATE DATABASE [IF_NOT_EXISTS] <ident>``.  The
    optional existence clause is exposed as ``existence_clause`` and the
    database identifier as ``dbName``.  Text matching ``oracleSqlComment``
    is ignored.
    """
    existence = Optional(IF_NOT_EXISTS)("existence_clause")
    database_name = ident("dbName")

    statement = Forward()
    statement <<= CREATE + DATABASE + existence + database_name
    statement.ignore(oracleSqlComment)
    return statement.parseString(query)
def insert_query_parse(query):
    """Parse an ``INSERT INTO ... VALUES`` statement and return the results.

    The grammar is ``INSERT INTO <table> VALUES (v, ...), (v, ...), ...``
    where each value is a real number, an integer or a quoted string.
    The table is exposed as ``tableName`` and the grouped rows as
    ``valuesList``.  Text matching ``oracleSqlComment`` is ignored.
    """
    literal = realNum | intNum | quotedString | dblQuotedString
    row_values = Group(delimitedList(literal))
    parenthesized_row = "(" + row_values + ")"
    all_rows = Group(delimitedList(parenthesized_row))

    statement = Forward()
    statement <<= (
        INSERT + INTO + tableName("tableName") + VALUES + all_rows("valuesList")
    )
    statement.ignore(oracleSqlComment)
    return statement.parseString(query)
def create_table_query_parse(query):
    """Parse a ``CREATE TABLE`` statement and return the pyparsing results.

    The grammar is ``CREATE TABLE <table> ( <column>, ... )`` where each
    column is ``<name> <type>`` optionally followed by ``NOT_NULL`` or
    ``NULL``.  ``CHAR(n)`` and ``VARCHAR(n)`` are combined into a single
    token.  The table is exposed as ``tableName`` and the grouped column
    definitions as ``columnList``.  Text matching ``oracleSqlComment`` is
    ignored.
    """
    createTableStmt = Forward()

    # Sized character types are collapsed into one token, e.g. "CHAR(10)".
    charType = Combine(CHAR + "(" + intNum + ")")
    varcharType = Combine(VARCHAR + "(" + intNum + ")")
    columnTypeName = (INT | FLOAT | charType | varcharType
                      | DATE | DATETIME | TIME | YEAR)

    # One group per column definition, all wrapped in an outer group.
    column = Group(columnName + columnTypeName + Optional(NOT_NULL | NULL))
    columnList = Group(delimitedList(column))

    createTableStmt <<= (CREATE + TABLE + tableName("tableName")
                         + "(" + columnList("columnList") + ")")
    createTableStmt.ignore(oracleSqlComment)
    # The original had a second, unreachable ``return tableName`` after this
    # statement; it has been removed.
    return createTableStmt.parseString(query)
def _get_parser(cls):
    """Build (once) and return the pyparsing grammar for SELECT/WITH queries.

    The grammar is cached on ``cls._parser``; later calls return the cached
    object.  Building it enables pyparsing's packrat cache.

    Parse actions attached to the grammar have side effects on the class:
    table identifiers are added to ``cls._table_identifiers`` and CTE names
    (plus ``{{ ref(...) }}`` / ``{{ source(...) }}`` targets) are added to
    ``cls._with_aliases``.

    Returns:
        The top-level parser element, accepting a select or with statement,
        optionally wrapped in parentheses.
    """
    if cls._parser is not None:
        return cls._parser

    ParserElement.enablePackrat()

    LPAR, RPAR, COMMA, LBRACKET, RBRACKET, LT, GT = map(Literal, "(),[]<>")
    ungrouped_select_stmt = Forward().setName("select statement")

    # keywords
    # NOTE: the tuple of targets and the string below must stay in the same
    # order; each name is bound to the CaselessKeyword of the same spelling.
    (
        UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING,
        NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED,
        NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY,
        LIMIT, OFFSET, OR, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE,
        END, CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP,
        MATCH, ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, WITH,
        EXTRACT, PARTITION, ROWS, RANGE, UNBOUNDED, PRECEDING, CURRENT, ROW,
        FOLLOWING, OVER, INTERVAL, DATE_ADD, DATE_SUB, ADDDATE, SUBDATE,
        REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, LAST_VALUE, NTH_VALUE,
        LEAD, LAG, PERCENTILE_CONT, PERCENTILE_DISC, RANK, DENSE_RANK,
        PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, DATETIME,
        TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, GEOGRAPHY,
        ARRAY, STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, ARRAY_CONCAT_AGG,
        AVG, BIT_AND, BIT_OR, BIT_XOR, COUNT, COUNTIF, LOGICAL_AND,
        LOGICAL_OR, MAX, MIN, STRING_AGG, SUM, CORR, COVAR_POP, COVAR_SAMP,
        STDDEV_POP, STDDEV_SAMP, STDDEV, VAR_POP, VAR_SAMP, VARIANCE,
        TIMESTAMP_ADD, TIMESTAMP_SUB, GENERATE_ARRAY, GENERATE_DATE_ARRAY,
        GENERATE_TIMESTAMP_ARRAY, FOR, SYSTEMTIME, AS, OF, WINDOW, RESPECT,
        IGNORE, NULLS,
    ) = map(
        CaselessKeyword,
        """
        UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING,
        NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED,
        NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY,
        LIMIT, OFFSET, OR, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE,
        END, CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP,
        MATCH, ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, WITH,
        EXTRACT, PARTITION, ROWS, RANGE, UNBOUNDED, PRECEDING, CURRENT, ROW,
        FOLLOWING, OVER, INTERVAL, DATE_ADD, DATE_SUB, ADDDATE, SUBDATE,
        REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, LAST_VALUE, NTH_VALUE,
        LEAD, LAG, PERCENTILE_CONT, PERCENTILE_DISC, RANK, DENSE_RANK,
        PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, DATETIME,
        TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, GEOGRAPHY,
        ARRAY, STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, ARRAY_CONCAT_AGG,
        AVG, BIT_AND, BIT_OR, BIT_XOR, COUNT, COUNTIF, LOGICAL_AND,
        LOGICAL_OR, MAX, MIN, STRING_AGG, SUM, CORR, COVAR_POP, COVAR_SAMP,
        STDDEV_POP, STDDEV_SAMP, STDDEV, VAR_POP, VAR_SAMP, VARIANCE,
        TIMESTAMP_ADD, TIMESTAMP_SUB, GENERATE_ARRAY, GENERATE_DATE_ARRAY,
        GENERATE_TIMESTAMP_ARRAY, FOR, SYSTEMTIME, AS, OF, WINDOW, RESPECT,
        IGNORE, NULLS
        """.replace(",", "").split(),
    )

    keyword_nonfunctions = MatchFirst((
        UNION, ALL, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING,
        NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED,
        NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY,
        LIMIT, OFFSET, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, END,
        CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, MATCH,
        STRUCT, WINDOW,
    ))
    keyword = keyword_nonfunctions | MatchFirst((
        ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, DATE_ADD,
        DATE_SUB, ADDDATE, SUBDATE, INTERVAL, STRING_AGG, REGEXP_EXTRACT,
        SPLIT, ORDINAL, UNNEST, SAFE_CAST, PARTITION, TIMESTAMP_ADD,
        TIMESTAMP_SUB, ARRAY, GENERATE_ARRAY, GENERATE_DATE_ARRAY,
        GENERATE_TIMESTAMP_ARRAY,
    ))

    identifier_word = Word(alphas + "_@#", alphanums + "@$#_")
    identifier = ~keyword + identifier_word.copy()
    collation_name = identifier.copy()
    # NOTE: Column names can be keywords.  Doc says they cannot, but in
    # practice it seems to work.
    column_name = identifier.copy()
    cast_to = identifier.copy()
    qualified_column_name = Group(
        delimitedList(column_name, delim=".")
        + Optional(
            Suppress("::") + delimitedList(cast_to("cast"), delim="::")))
    # NOTE: As with column names, column aliases can be keywords, e.g.
    # functions like `current_time`.  Other keywords, e.g. `from` make
    # parsing pretty difficult (e.g. "SELECT a from from b" is confusing.)
    column_alias = ~keyword_nonfunctions + column_name.copy()
    table_name = identifier.copy()
    table_alias = identifier.copy()
    index_name = identifier.copy()
    function_name = identifier.copy()
    parameter_name = identifier.copy()

    # NOTE: The expression in a CASE statement can be an integer.  E.g. this
    # is valid SQL:
    #   select CASE 1 WHEN 1 THEN -1 ELSE -2 END from test_table
    unquoted_case_identifier = ~keyword + Word(alphanums + "$_")
    quoted_case_identifier = ~keyword + (
        QuotedString('"') ^ Suppress("`") + CharsNotIn("`") + Suppress("`"))
    case_identifier = quoted_case_identifier | unquoted_case_identifier
    case_expr = (Optional(case_identifier + Suppress("."))
                 + Optional(case_identifier + Suppress("."))
                 + case_identifier)

    # expression
    expr = Forward().setName("expression")

    integer = Regex(r"[+-]?\d+")
    numeric_literal = Regex(r"[+-]?\d*\.?\d+([eE][+-]?\d+)?")
    string_literal = QuotedString("'") | QuotedString('"') | QuotedString(
        "`")
    regex_literal = "r" + string_literal
    blob_literal = Regex(r"[xX]'[0-9A-Fa-f]+'")
    date_or_time_literal = (DATE | TIME | DATETIME
                            | TIMESTAMP) + string_literal
    literal_value = (
        numeric_literal
        | string_literal
        | regex_literal
        | blob_literal
        | date_or_time_literal
        | NULL
        | CURRENT_TIME + Optional(LPAR + Optional(string_literal) + RPAR)
        | CURRENT_DATE + Optional(LPAR + Optional(string_literal) + RPAR)
        | CURRENT_TIMESTAMP
        + Optional(LPAR + Optional(string_literal) + RPAR))
    bind_parameter = Word("?", nums) | Combine(
        oneOf(": @ $") + parameter_name)
    type_name = oneOf(
        """TEXT REAL INTEGER BLOB NULL TIMESTAMP STRING DATE
        INT64 NUMERIC FLOAT64 BOOL BYTES DATETIME GEOGRAPHY TIME ARRAY
        STRUCT""",
        caseless=True,
    )
    date_part = oneOf(
        """DAY DAY_HOUR DAY_MICROSECOND DAY_MINUTE DAY_SECOND HOUR
        HOUR_MICROSECOND HOUR_MINUTE HOUR_SECOND MICROSECOND MINUTE
        MINUTE_MICROSECOND MINUTE_SECOND MONTH QUARTER SECOND
        SECOND_MICROSECOND WEEK YEAR YEAR_MONTH""",
        caseless=True,
    )
    datetime_operators = (DATE_ADD | DATE_SUB | ADDDATE | SUBDATE
                          | TIMESTAMP_ADD | TIMESTAMP_SUB)

    def invalid_date_add(s, loc, tokens):
        """Reject a misspelled date operator unless the previous line has '--ignore'.

        Only referenced by the disabled ``bad_datetime_operators`` rule below.
        """
        prev_newline = s[:loc].rfind('\n')
        prev_prev_newline = s[:prev_newline].rfind('\n')
        if '--ignore' in s[prev_prev_newline:prev_newline]:
            pass
        else:
            raise RuntimeError(
                "{} is not valid, did you mean 'date_add'".format(
                    tokens[0]))

    # Disabled rule:
    # bad_datetime_operators = (
    #     CaselessKeyword('dateadd').setParseAction(invalid_date_add)
    # )

    grouping_term = expr.copy()
    ordering_term = Group(
        expr("order_key")
        + Optional(COLLATE + collation_name("collate"))
        + Optional(ASC | DESC)("direction"))("ordering_term")

    function_arg = expr.copy()("function_arg")
    function_args = Optional(
        "*"
        | Optional(DISTINCT)
        + delimitedList(function_arg)
        + Optional((RESPECT | IGNORE) + NULLS))("function_args")
    function_call = ((function_name | keyword)("function_name")
                     + LPAR
                     + Group(function_args)("function_args_group")
                     + RPAR)('function')

    # PERCENTILE_DISC was previously misspelled "PRECENTILE_DISC", so the
    # real function name was never matched as a navigation function.
    navigation_function_name = (FIRST_VALUE | LAST_VALUE | NTH_VALUE
                                | LEAD | LAG | PERCENTILE_CONT
                                | PERCENTILE_DISC)
    aggregate_function_name = (ANY_VALUE | ARRAY_AGG | ARRAY_CONCAT_AGG
                               | AVG | BIT_AND | BIT_OR | BIT_XOR | COUNT
                               | COUNTIF | LOGICAL_AND | LOGICAL_OR | MAX
                               | MIN | STRING_AGG | SUM)
    statistical_aggregate_function_name = (CORR | COVAR_POP | COVAR_SAMP
                                           | STDDEV_POP | STDDEV_SAMP
                                           | STDDEV | VAR_POP | VAR_SAMP
                                           | VARIANCE)
    numbering_function_name = (RANK | DENSE_RANK | PERCENT_RANK
                               | CUME_DIST | NTILE | ROW_NUMBER)
    analytic_function_name = (
        navigation_function_name
        | aggregate_function_name
        | statistical_aggregate_function_name
        | numbering_function_name)("analytic_function_name")

    partition_expression_list = delimitedList(grouping_term)(
        "partition_expression_list")
    window_frame_boundary_start = (UNBOUNDED + PRECEDING
                                   | numeric_literal
                                   + (PRECEDING | FOLLOWING)
                                   | CURRENT + ROW)
    window_frame_boundary_end = (UNBOUNDED + FOLLOWING
                                 | numeric_literal
                                 + (PRECEDING | FOLLOWING)
                                 | CURRENT + ROW)
    window_frame_clause = (ROWS | RANGE) + (
        ((UNBOUNDED + PRECEDING)
         | (numeric_literal + PRECEDING)
         | (CURRENT + ROW))
        | (BETWEEN + window_frame_boundary_start
           + AND + window_frame_boundary_end))
    window_name = identifier.copy()("window_name")
    window_specification = (
        Optional(window_name)
        + Optional(PARTITION + BY + partition_expression_list)
        + Optional(ORDER + BY + delimitedList(ordering_term))
        + Optional(window_frame_clause)("window_specification"))
    # setParseAction() mutates function_args in place, so the same action is
    # also active where function_args is used inside function_call.
    analytic_function = (
        analytic_function_name
        + LPAR
        + function_args.setParseAction(debug)
        + RPAR
        + OVER
        + (window_name
           | LPAR + Optional(window_specification)('window') + RPAR)
    )("analytic_function")

    string_agg_term = (STRING_AGG
                       + LPAR
                       + Optional(DISTINCT)('has_distinct')
                       + expr('string_agg_expr')
                       + Optional(COMMA + string_literal('delimiter'))
                       + Optional(ORDER + BY + expr
                                  + Optional(ASC | DESC)
                                  + Optional(LIMIT + integer))
                       + RPAR)("string_agg")
    array_literal = (
        Optional(ARRAY + Optional(LT + delimitedList(type_name) + GT))
        + LBRACKET + delimitedList(expr) + RBRACKET)
    interval = INTERVAL + expr + date_part
    array_generator = (GENERATE_ARRAY + LPAR
                       + numeric_literal + COMMA
                       + numeric_literal + COMMA
                       + numeric_literal + RPAR)
    date_array_generator = (
        (GENERATE_DATE_ARRAY | GENERATE_TIMESTAMP_ARRAY)
        + LPAR
        + expr("start_date")
        + COMMA
        + expr("end_date")
        + Optional(COMMA + interval)
        + RPAR)
    explicit_struct = (
        STRUCT
        + Optional(LT + delimitedList(type_name) + GT)
        + LPAR
        + Optional(delimitedList(expr + Optional(AS + identifier)))
        + RPAR)

    case_when = WHEN + expr.copy()("when")
    case_then = THEN + expr.copy()("then")
    case_clauses = Group(ZeroOrMore(case_when + case_then))
    case_else = ELSE + expr.copy()("_else")
    case_stmt = (CASE
                 + Optional(case_expr.copy())
                 + case_clauses("case_clauses")
                 + Optional(case_else)
                 + END)("case")

    class SelectStatement(SemanticToken):
        """Semantic wrapper around the tokens of a select statement."""

        def __init__(self, tokens):
            self.tokens = tokens

        def getName(self):
            return 'select'

        @classmethod
        def parse(cls, tokens):
            return SelectStatement(tokens)

    class Function(SemanticToken):
        """Semantic wrapper for a plain function call."""

        def __init__(self, func, tokens):
            self.func = func
            self.tokens = tokens

        def getName(self):
            return 'function'

        @classmethod
        def parse(cls, tokens):
            # tokens are: name, "(", <args...>, ")"
            method = tokens[0]
            args = tokens[2:-1]
            return Function(method, args)

        def __repr__(self):
            return "func:{}({})".format(self.func, self.tokens)

    class WindowFunction(Function):
        """Semantic wrapper for an analytic function with its OVER clause."""

        def __init__(self, func, tokens, func_args, partition_args,
                     order_args, window_args):
            self.func = func
            self.tokens = tokens
            self.func_args = func_args
            self.partition_args = partition_args
            self.order_args = order_args
            self.window_args = window_args

        def getName(self):
            return 'window function'

        @classmethod
        def parse(cls, tokens):
            return WindowFunction(tokens.analytic_function_name,
                                  tokens,
                                  tokens.function_args,
                                  tokens.partition_expression_list,
                                  tokens.ordering_term,
                                  tokens.window_specification)

        def __repr__(self):
            return "window:{}({})over({}, {}, {})".format(
                self.func, self.func_args, self.partition_args,
                self.order_args, self.window_args)

    class CaseStatement(SemanticToken):
        """Semantic wrapper for a CASE ... END expression."""

        def __init__(self, tokens, whens, _else):
            self.tokens = tokens
            self.whens = whens
            self._else = _else

        def getName(self):
            return 'case'

        @classmethod
        def parse_whens(self, tokens):
            # Consume the flat clause list four tokens at a time:
            # WHEN <when> THEN <then>
            whens = []
            while len(tokens) > 0:
                _, when, _, then, *tokens = tokens
                whens.append({"when": when, "then": then})
            return whens

        @classmethod
        def parse(cls, tokens):
            whens = tokens[1]
            _else = tokens[3]
            return CaseStatement(tokens, cls.parse_whens(whens), _else)

        def __repr__(self):
            return "<case statement ({}, {})>".format(
                len(self.whens), self._else)

    expr_term = (
        (analytic_function)("analytic_function").setParseAction(
            WindowFunction.parse)
        | (CAST + LPAR + expr + AS + type_name + RPAR)("cast")
        | (SAFE_CAST + LPAR + expr + AS + type_name + RPAR)("safe_cast")
        | (Optional(EXISTS) + LPAR + ungrouped_select_stmt
           + RPAR)("subselect")
        | (literal_value)("literal")
        | (bind_parameter)("bind_parameter")
        | (EXTRACT + LPAR + expr + FROM + expr + RPAR)("extract")
        | case_stmt.setParseAction(CaseStatement.parse)
        | (datetime_operators + LPAR + expr + COMMA + interval
           + RPAR)("date_operation")
        # | (bad_datetime_operators + LPAR + expr + COMMA + interval + RPAR)
        | string_agg_term("string_agg_term")
        | array_literal("array_literal")
        | array_generator("array_generator")
        | date_array_generator("date_array_generator")
        | explicit_struct("explicit_struct")
        | function_call("function_call").setParseAction(Function.parse)
        | qualified_column_name("column").setParseAction(
            lambda x: ".".join([str(i) for i in x[0]]))
    ).setParseAction(debug) + Optional(
        LBRACKET + (OFFSET | ORDINAL) + LPAR + expr + RPAR
        + RBRACKET)("offset_ordinal")

    struct_term = (LPAR + delimitedList(expr_term) + RPAR)

    KNOWN_OPS = [(BETWEEN, AND),
                 Literal("||").setName("concat"),
                 Literal("*").setName("mul"),
                 Literal("/").setName("div"),
                 Literal("+").setName("add"),
                 Literal("-").setName("sub"),
                 Literal("<>").setName("neq"),
                 Literal(">").setName("gt"),
                 Literal("<").setName("lt"),
                 Literal(">=").setName("gte"),
                 Literal("<=").setName("lte"),
                 Literal("=").setName("eq"),
                 Literal("==").setName("eq"),
                 Literal("!=").setName("neq"),
                 IN.setName("in"),
                 IS.setName("is"),
                 LIKE.setName("like"),
                 OR.setName("or"),
                 AND.setName("and"),
                 NOT.setName('not')]

    class Operator(SemanticToken):
        """Semantic wrapper for an operator application found by infixNotation."""

        def __init__(self, op, assoc, name, tokens):
            self.op = op
            self.assoc = assoc
            self.name = name
            self.tokens = tokens

        def getName(self):
            return 'operator'

        @classmethod
        def parse(cls, tokens):
            """Wrap a binary operation; unknown operators pass through unchanged."""
            # ARRANGE INTO {op: params} FORMAT
            toks = tokens[0]
            if toks[1] not in KNOWN_OPS:
                # Previously this branch dropped into an ipdb breakpoint
                # before returning the tokens; the breakpoint is removed.
                return tokens
            op = KNOWN_OPS[KNOWN_OPS.index(toks[1])]
            return Operator(op, 'binary', op.name, [toks[0], toks[2:]])

        @classmethod
        def parse_unary(cls, tokens):
            """Wrap a unary operation; unknown operators pass through unchanged."""
            toks = tokens[0]
            if toks[0] not in KNOWN_OPS:
                # Previously this path hit an ipdb breakpoint and then
                # failed on an unbound ``op``.  Returning the tokens
                # untouched mirrors what parse() does for unknown operators.
                return tokens
            op = KNOWN_OPS[KNOWN_OPS.index(toks[0])]
            return Operator(op, 'unary', op.name, [toks[1:]])

        @classmethod
        def parse_ternary(cls, tokens):
            """Leave ternary (BETWEEN ... AND ...) tokens unchanged.

            The original body was only a debugger breakpoint; returning
            ``None`` tells pyparsing to keep the matched tokens as they are.
            """
            return None

        def __repr__(self):
            return "<operator({}, {}, {})>".format(self.op, self.assoc,
                                                   self.tokens)

    UNARY, BINARY, TERNARY = 1, 2, 3
    expr << infixNotation(
        (expr_term | struct_term),
        [
            (oneOf("- + ~") | NOT, UNARY, opAssoc.RIGHT,
             Operator.parse_unary),
            (ISNULL | NOTNULL | NOT + NULL, UNARY, opAssoc.LEFT,
             Operator.parse_unary),
            ("||", BINARY, opAssoc.LEFT, Operator.parse),
            (oneOf("* / %"), BINARY, opAssoc.LEFT, Operator.parse),
            (oneOf("+ -"), BINARY, opAssoc.LEFT, Operator.parse),
            (oneOf("<< >> & |"), BINARY, opAssoc.LEFT, Operator.parse),
            (oneOf("= > < >= <= <> != !< !>"), BINARY, opAssoc.LEFT,
             Operator.parse),
            (IS + Optional(NOT)
             | Optional(NOT) + IN
             | Optional(NOT) + LIKE
             | GLOB
             | MATCH
             | REGEXP, BINARY, opAssoc.LEFT, Operator.parse),
            ((BETWEEN, AND), TERNARY, opAssoc.LEFT,
             Operator.parse_ternary),
            (Optional(NOT) + IN + LPAR
             + Group(ungrouped_select_stmt | delimitedList(expr))
             + RPAR, UNARY, opAssoc.LEFT, Operator.parse_unary),
            (AND, BINARY, opAssoc.LEFT, Operator.parse),
            (OR, BINARY, opAssoc.LEFT, Operator.parse),
        ],
        lpar=Literal('('),
        rpar=Literal(')'),
    )
    quoted_expr = (expr
                   ^ Suppress('"') + expr + Suppress('"')
                   ^ Suppress("'") + expr + Suppress("'")
                   ^ Suppress("`") + expr + Suppress("`"))("quoted_expr")

    compound_operator = (UNION + Optional(ALL | DISTINCT)
                         | INTERSECT + DISTINCT
                         | EXCEPT + DISTINCT
                         | INTERSECT
                         | EXCEPT)("compound_operator")

    join_constraint = Group(
        Optional(ON + expr
                 | USING + LPAR
                 + Group(delimitedList(qualified_column_name))
                 + RPAR))("join_constraint")

    join_op = (COMMA | Group(
        Optional(NATURAL)
        + Optional(INNER
                   | CROSS
                   | LEFT + OUTER
                   | LEFT
                   | RIGHT + OUTER
                   | RIGHT
                   | FULL + OUTER
                   | OUTER
                   | FULL)
        + JOIN))("join_op")

    join_source = Forward()

    # We support three kinds of table identifiers.
    #
    # First, dot delimited info like project.dataset.table, where
    # each component follows the rules described in the BigQuery
    # docs, namely:
    #  Contain letters (upper or lower case), numbers, and underscores
    #
    # Second, a dot delimited quoted string.  Since it's quoted, we'll be
    # liberal w.r.t. what characters we allow.  E.g.:
    #  `project.dataset.name-with-dashes`
    #
    # Third, a series of quoted strings, delimited by dots, e.g.:
    #  `project`.`dataset`.`name-with-dashes`
    #
    # We won't attempt to support combinations, like:
    #  project.dataset.`name-with-dashes`
    #  `project`.`dataset.name-with-dashes`

    def record_table_identifier(t):
        # Left-pad with None so every entry is (project, dataset, table).
        identifier_list = t.asList()
        padded_list = [None] * (3 - len(identifier_list)) + identifier_list
        cls._table_identifiers.add(tuple(padded_list))

    standard_table_part = ~keyword + Word(alphanums + "_")
    standard_table_identifier = (
        Optional(standard_table_part("project") + Suppress("."))
        + Optional(standard_table_part("dataset") + Suppress("."))
        + standard_table_part("table")
    ).setParseAction(lambda t: record_table_identifier(t))

    quoted_project_part = (
        Suppress('"') + CharsNotIn('"') + Suppress('"')
        | Suppress("'") + CharsNotIn("'") + Suppress("'")
        | Suppress("`") + CharsNotIn("`") + Suppress("`"))
    quoted_table_part = (Suppress('"') + CharsNotIn('".') + Suppress('"')
                         | Suppress("'") + CharsNotIn("'.") + Suppress("'")
                         | Suppress("`") + CharsNotIn("`.") + Suppress("`"))
    quoted_table_parts_identifier = (
        Optional(quoted_project_part("project") + Suppress("."))
        + Optional(quoted_table_part("dataset") + Suppress("."))
        + quoted_table_part("table")
    ).setParseAction(lambda t: record_table_identifier(t))

    def record_quoted_table_identifier(t):
        # The last two dot-separated parts are dataset and table; whatever
        # precedes them (possibly containing dots) is the project.
        identifier_list = t.asList()[0].split(".")
        first = ".".join(identifier_list[0:-2]) or None
        second = identifier_list[-2]
        third = identifier_list[-1]
        identifier_list = [first, second, third]
        padded_list = [None] * (3 - len(identifier_list)) + identifier_list
        cls._table_identifiers.add(tuple(padded_list))

    quotable_table_parts_identifier = (
        Suppress('"') + CharsNotIn('"') + Suppress('"')
        | Suppress("'") + CharsNotIn("'") + Suppress("'")
        | Suppress("`") + CharsNotIn("`") + Suppress("`")
    ).setParseAction(lambda t: record_quoted_table_identifier(t))

    table_identifier = (standard_table_identifier
                        | quoted_table_parts_identifier
                        | quotable_table_parts_identifier)

    def record_ref(t):
        # Records ("ref" | "source", <targets...>) in both registries.
        lol = [t.op] + t.ref_target.asList()
        cls._with_aliases.add(tuple(lol))
        cls._table_identifiers.add(tuple(lol))

    ref_target = identifier.copy()
    single_source = (
        # ref + source statements
        ((Suppress('{{')
          + (CaselessKeyword('ref') | CaselessKeyword("source"))("op")
          + LPAR
          + delimitedList((Suppress("'") | Suppress('"'))
                          + ref_target
                          + (Suppress("'") | Suppress('"')))("ref_target")
          + RPAR
          + Suppress("}}")).setParseAction(record_ref)
         | table_identifier)
        + Optional(Optional(AS) + table_alias("table_alias*"))
        + Optional(FOR + SYSTEMTIME + AS + OF + string_literal)
        + Optional(INDEXED + BY + index_name("name")
                   | NOT + INDEXED)("index")
        | (LPAR + ungrouped_select_stmt + RPAR
           + Optional(Optional(AS) + table_alias))('subquery')
        | (LPAR + join_source + RPAR)
        | (UNNEST + LPAR + expr + RPAR)
        + Optional(Optional(AS) + column_alias))

    join_source << (Group(single_source + OneOrMore(
        Group(join_op + single_source + join_constraint)('joins*')))
                    | single_source)('sources*')

    over_partition = (
        PARTITION + BY
        + delimitedList(partition_expression_list))("over_partition")
    over_order = ORDER + BY + delimitedList(ordering_term)
    over_unsigned_value_specification = expr
    over_window_frame_preceding = (
        UNBOUNDED + PRECEDING
        | over_unsigned_value_specification + PRECEDING
        | CURRENT + ROW)
    over_window_frame_following = (
        UNBOUNDED + FOLLOWING
        | over_unsigned_value_specification + FOLLOWING
        | CURRENT + ROW)
    over_window_frame_bound = (over_window_frame_preceding
                               | over_window_frame_following)
    over_window_frame_between = (BETWEEN + over_window_frame_bound
                                 + AND + over_window_frame_bound)
    over_window_frame_extent = (over_window_frame_preceding
                                | over_window_frame_between)
    over_row_or_range = (ROWS | RANGE) + over_window_frame_extent
    over = (OVER
            + LPAR
            + Optional(over_partition)
            + Optional(over_order)
            + Optional(over_row_or_range)
            + RPAR)("over")

    result_column = (
        Optional(table_name + ".") + "*"
        + Optional(EXCEPT + LPAR + delimitedList(column_name) + RPAR)
        | Group(quoted_expr
                + Optional(over)
                + Optional(Optional(AS) + column_alias('alias'))))

    window_select_clause = (WINDOW + identifier + AS
                            + LPAR + window_specification + RPAR)

    select_core = (
        SELECT
        + Optional(DISTINCT | ALL)
        + Group(delimitedList(result_column))("columns")
        + Optional(FROM - join_source("from*"))
        + Optional(WHERE + expr('where'))
        + Optional(GROUP + BY
                   + Group(delimitedList(grouping_term))("group_by_terms"))
        + Optional(HAVING + expr("having_expr"))
        + Optional(ORDER + BY
                   + Group(delimitedList(ordering_term))("order_by_terms"))
        + Optional(delimitedList(window_select_clause)))
    grouped_select_core = select_core | (LPAR + select_core + RPAR)

    ungrouped_select_stmt << (
        grouped_select_core
        + ZeroOrMore(compound_operator + grouped_select_core)
        + Optional(LIMIT
                   + (Group(expr + OFFSET + expr)
                      | Group(expr + COMMA + expr)
                      | expr)("limit")))("select")
    select_stmt = ungrouped_select_stmt | (LPAR + ungrouped_select_stmt
                                           + RPAR)

    # define comment format, and ignore them
    sql_comment = oneOf("-- #") + restOfLine | cStyleComment
    select_stmt.ignore(sql_comment)

    def record_with_alias(t):
        # Left-pad with None so the entry has the same 3-tuple shape as the
        # table identifiers.
        identifier_list = t.asList()
        padded_list = [None] * (3 - len(identifier_list)) + identifier_list
        cls._with_aliases.add(tuple(padded_list))

    with_stmt = Forward().setName("with statement")
    # NOTE(review): setParseAction() is applied to the shared ``identifier``
    # element itself (not a copy), so other uses of ``identifier`` that were
    # not copied beforehand also trigger record_with_alias -- confirm this
    # is intended.
    with_clause = Group(
        identifier.setParseAction(lambda t: record_with_alias(t))
        ('cte_name')
        - AS
        - LPAR
        + (select_stmt | with_stmt)
        - RPAR)
    with_core = WITH + delimitedList(with_clause)('ctes')
    with_stmt << (with_core - ~Literal(',') + ungrouped_select_stmt)
    with_stmt.ignore(sql_comment)

    select_or_with = select_stmt | with_stmt
    select_or_with_parens = LPAR + select_or_with - RPAR

    cls._parser = select_or_with | select_or_with_parens
    return cls._parser
def build(self):
    """Assemble the CQL grammar and store it on ``self.parser``.

    Also stores the neighbourhood-operator element on ``self.termop``.
    Reads ``self.neighbourhood_symbols``, ``self.unicode_printables``,
    ``self.separators``, ``self.wildcards``, ``self.binop_symbols`` and
    ``self.booleans_or``.
    """

    # ------------------------------------------
    #   C. building blocks
    # ------------------------------------------
    # Neighbourhood operators are matched case-insensitively and
    # upper-cased in the results.
    neighbourhood_op = Regex(
        "|".join(self.neighbourhood_symbols), re.IGNORECASE)
    neighbourhood_op.setParseAction(upcaseTokens)
    neighbourhood_op.setName("termop")
    self.termop = neighbourhood_op

    word_chars = self.unicode_printables + self.separators + self.wildcards
    termword = Word(word_chars).setName("term")
    words_with_ops = termword + OneOrMore(self.termop + termword)

    # ------------------------------------------
    #   D. triple
    # ------------------------------------------
    index = Word(alphanums).setName("index")
    binop = oneOf(self.binop_symbols, caseless=True).setName("binop")

    # A term is one of four alternatives; the longest match wins (``^``).
    # 1. a quoted string
    quoted_term = quotedString.setName("term")
    # 2. a single term word
    plain_term = termword.setName("term")
    # 3. term words joined by neighbourhood operators, wrapped in parentheses
    padding_before = Suppress(ZeroOrMore(' '))
    padding_after = Suppress(ZeroOrMore(' '))
    wrapped_term = Combine(
        '(' + padding_before + words_with_ops + padding_after + ')'
    ).setName("term")
    # 4. for convenience, the same without the surrounding parentheses
    bare_term = Combine(words_with_ops).setName("term")
    term = quoted_term ^ plain_term ^ wrapped_term ^ bare_term

    # ------------------------------------------
    #   E. condition
    # ------------------------------------------
    statement = Forward()

    # Regular notation: 'index=term', or a parenthesized sub-statement.
    regular_triple = (index + binop + term).setResultsName("triple")
    regular_subquery = ("(" + statement + ")").setResultsName("subquery")
    condition_base = Group(regular_triple | regular_subquery)

    # Value shortcut notation, e.g. 'index=(term)' or
    # 'index=(term1 and term2 or term3)'.
    # "term + NotAny(binop)" helps giving proper error messages like
    # "ParseException: Expected term (at char 4)" for erroneous queries
    # like "foo=".
    short_triple = (term + NotAny(binop)).setResultsName("triple-short")
    short_subquery = (
        index + binop + "(" + statement + ")"
    ).setResultsName("subquery-short")
    condition_shortcut = Group(short_triple | short_subquery)

    condition = condition_base | condition_shortcut

    # ------------------------------------------
    #   F. statement
    # ------------------------------------------
    statement << (condition + ZeroOrMore(self.booleans_or + statement))

    # SQL-style line comments are skipped.
    line_comment = "--" + restOfLine
    statement.ignore(line_comment)

    self.parser = statement
term << operatorPrecedence( number | predicate | variable, [ (oneOf("+ -"), 1, opAssoc.RIGHT, FOLUnOp), (oneOf("^"), 2, opAssoc.LEFT, FOLBinOp), (oneOf("* /"), 2, opAssoc.LEFT, FOLBinOp), (oneOf("+ -"), 2, opAssoc.LEFT, FOLBinOp), (oneOf("< <= > >= "), 2, opAssoc.LEFT, FOLBinOp), ], ) # main parser for FOL formula formula = Forward() formula.ignore(comment) forall_expression = Group( forall.setResultsName("quantifier") + delimitedList(variable).setResultsName("vars") + colon + formula.setResultsName("args") ).setParseAction(FOLQuant) exists_expression = Group( exists.setResultsName("quantifier") + delimitedList(variable).setResultsName("vars") + colon + formula.setResultsName("args") ).setParseAction(FOLQuant) operand = forall_expression | exists_expression | boolean | term
def performIPOperatorSanityCheck(componentName, propagationDimension, operatorCodeSlice, codeBlock):
    """
    Check that the user hasn't tried to use an IP operator where an IP operator cannot be used.

    IP operators must be diagonal, so one cannot have expressions of the form ``dy_dt = L[x];`` for IP operators.
    This is valid for EX operators, but not for IP. This is a common mistake for users to make, and so we should
    do our best to spot it and report the error. Another mistake users make is trying to multiply the operator,
    for example ``dy_dt = i*L[y];``. This code does a sophisticated validation by constructing a parse tree for
    each statement in the code taking into account operator precedence. This sanity checking is even able to pick
    up problems such as ``dphi_dt = i*(V*phi + U*mod2(phi)*phi + T[phi]);``.
    If the user's code passes this test, then it is a reasonable assumption that they are using IP operators safely.

    Arguments:
      componentName, propagationDimension -- combined into the only variable the
          operator may be assigned to, ``d<componentName>_d<propagationDimension>``.
      operatorCodeSlice -- slice into ``codeBlock.codeString`` selecting the
          operator text being checked.
      codeBlock -- object providing ``codeString`` (the code to scan) and
          ``xmlElement`` (passed to ``parserWarning``).

    Raises ``CodeParserException`` when an unsafe use is detected.  Emits a
    parser warning when the operator cannot be located or when a
    ``RuntimeError`` is raised during the check.  Returns nothing.
    """

    # The exact text of the operator occurrence being validated.
    operatorString = codeBlock.codeString[operatorCodeSlice]

    expr = Forward()

    # Tag the operator so it can be found in the parse results by name.
    operatorKeyword = Keyword(operatorString).setResultsName('targetOperator')

    # Alternatives are tried in order: the target operator, a function call,
    # an array access, a quoted string, a plain identifier, a number.
    operand = operatorKeyword \
                | (identifier + Group('(' + delimitedList(expr) + ')')) \
                | (identifier + Group(OneOrMore('[' + expr + ']'))) \
                | quotedString.copy() \
                | identifier \
                | numericConstant
    operand.ignore(cppStyleComment.copy())

    # Operator table.  The leading ~oneOf(...) lookaheads stop a short operator
    # from matching the first characters of a longer one (e.g. '+' in '+=').
    expr << operatorPrecedence(
        operand,
        [
            (oneOf('++ --'), 1, opAssoc.LEFT),
            (oneOf('. ->'), 2, opAssoc.LEFT),
            (~oneOf('-> -= += *= &= |=') + oneOf('+ - ! ~ * & ++ --'), 1, opAssoc.RIGHT),
            (~oneOf('*= /= %=') + oneOf('* / %'), 2, opAssoc.LEFT),
            (~oneOf('++ -- -> -= +=') + oneOf('+ -'), 2, opAssoc.LEFT),
            # Although the operators below don't all have the same precedence, as we don't actually
            # care about them as they are all invalid uses of the IP operator, we can cheat and lump
            # them together
            (~oneOf('<<= >>= &= |=') + oneOf('<< >> < <= > >= == != & ^ | && ||'), 2, opAssoc.LEFT),
            # Correct ordering
            # (~oneOf('<<= >>=') + oneOf('<< >>'), 2, opAssoc.LEFT),
            # (~oneOf('<< >> <<= >>=') + oneOf('< <= > >='), 2, opAssoc.LEFT),
            # (oneOf('== !='), 2, opAssoc.LEFT),
            # (~oneOf('&& &=') + '&', 2, opAssoc.LEFT),
            # ('^', 2, opAssoc.LEFT),
            # (~oneOf('|| |=') + '|', 2, opAssoc.LEFT),
            # ('&&', 2, opAssoc.LEFT),
            # ('||', 2, opAssoc.LEFT),
            (('?',':'), 3, opAssoc.RIGHT),
            (~Literal('==') + oneOf('= += -= *= /= %= <<= >>= &= ^= |= =>'), 2, opAssoc.RIGHT),
            (',', 2, opAssoc.LEFT),
        ]
    )
    expr.ignore(cppStyleComment.copy())

    statement = expr + Suppress(';')

    # Path through the parse tree from the statement root down to the operator.
    # Shared by the two nested helpers below.
    stack = []
    # locals() supplies componentName and propagationDimension to the format.
    expectedAssignmentVariable = 'd%(componentName)s_d%(propagationDimension)s' % locals()

    def validateStack():
        """
        It is the job of this function to validate the operations that the located operator is involved in.
        The stack describes the part of the parse tree in which the operator was found. The first element in the stack
        is the outermost operation, and the last the innermost. The last element is guaranteed to be the operator itself.

        Returns True when every enclosing operation is acceptable, otherwise
        an error message string.
        """
        # Reverse the stack as we want to search the parse tree from inner-most expression to outer-most.
        stack.reverse()
        assignmentHit = False
        errorMessageCommon = "Due to the way IP operators work, they can only contribute to the derivative of the variable " \
            "they act on, i.e. dx_dt = L[x]; not dy_dt = L[x];\n\n"

        # We don't need to check the first element of the stack
        # as we are guaranteed that it is the operator itself. This will be useful for determining
        # which part of the parse tree we should be looking at.
        for idx, node in enumerate(stack[1:]):
            # A single-element node carries no operation to inspect.
            if len(node) == 1: continue
            # idx is the index in the stack of the next element *deeper* in the parse tree.
            previousStackEntry = stack[idx]
            if not isinstance(stack[idx], basestring):
                previousStackEntry = previousStackEntry.asList()
            # The token just before the deeper entry is taken as the operator;
            # when the deeper entry comes first, the token at index 1 is used.
            binaryOpIdx = node.asList().index(previousStackEntry) - 1
            if binaryOpIdx < 0: binaryOpIdx = 1
            # Unary '+' is safe.
            if node[0] == '+': continue
            # Binary '+' is safe.
            if node[binaryOpIdx] == '+': continue
            # Binary '-' is safe if the operator is the first argument.
            if node[binaryOpIdx] == '-' and node.asList().index(previousStackEntry) == 0: continue
            # Assignment is safe if it there is only one, and if it's to the right variable
            if node[binaryOpIdx] in ['=', '+=']:
                if node[0] == expectedAssignmentVariable:
                    assignmentHit = True
                    continue
                else:
                    return errorMessageCommon + "In this case, you should probably use an EX operator instead of an "\
                            "IP operator."
            else:
                return errorMessageCommon + "You appear to be using the IP operator in an unsafe operation. " \
                        "The most likely cause is trying to multiply it by something, e.g. dphi_dt = 0.5*L[phi]; "\
                        "If this is the cause and you are multiplying by a constant, just move the constant into the "\
                        "definition of the operator itself. i.e. L = -0.5*kx*kx; If you are multiplying by something "\
                        "that isn't constant e.g. dphi_dt = x*L[phi]; where x is a dimension, you must use an EX operator "\
                        "instead."

        if not assignmentHit:
            return errorMessageCommon + "You appear to be missing the assignment for this particular operator."
        return True

    # Raised purely to unwind the recursive search once the operator is found.
    class FoundTargetException(Exception): pass

    def findOperatorInParseTree(results):
        """Depth-first search for the tagged operator, recording the path in `stack`."""
        stack.append(results)
        if 'targetOperator' in results:
            stack.append(results.targetOperator)
            # `stack` is deliberately left populated for validateStack().
            raise FoundTargetException()
        for item in results:
            # Plain string tokens have no children to descend into.
            if isinstance(item, basestring): continue
            findOperatorInParseTree(item)
        # Operator not under this node: drop it from the path again.
        del stack[-1]

    try:
        foundOperator = False
        for tokens, start, end in statement.scanString(codeBlock.codeString):
            # Skip statements that lie entirely outside the operator's slice.
            if start > operatorCodeSlice.stop or end < operatorCodeSlice.start: continue
            try:
                findOperatorInParseTree(tokens)
            except FoundTargetException:
                foundOperator = True
                # NOTE(review): `stack` is not emptied after a hit, so a later
                # statement that also passes the range test above would be
                # searched with leftover entries -- confirm whether that can occur.
                result = validateStack()
                if result is not True:
                    raise CodeParserException(
                        codeBlock,
                        operatorCodeSlice.start,
                        result + ("\n\nThe conflict was caused by the operator '%s'." \
                        % operatorString)
                    )
        if not foundOperator:
            # NOTE(review): the contact address in the message below reads
            # '[email protected]', which looks like residue of an e-mail
            # obfuscation filter rather than a real address -- confirm and
            # restore the intended address.
            parserWarning(
                codeBlock.xmlElement,
                "Unable to check the safety of your IP operator '%s' because the containing expression could not be found. "
                "Please send a copy of your script to [email protected] so this problem can be investigated." \
                % operatorString
            )
    except RuntimeError:
        # Presumably the interpreter's recursion limit being hit while parsing
        # or searching deeply nested code; reported as a warning only.
        parserWarning(
            codeBlock.xmlElement,
            "Unable to check the safety of your IP operator because your code is too deeply nested."
        )
identifier = Word(alphas + '_', alphanums + '_') numericConstant = Regex(r'\b((0(x|X)[0-9a-fA-F]*)|(([0-9]+\.?[0-9]*)|(\.[0-9]+))((e|E)(\+|-)?[0-9]+)?)(L|l|UL|ul|u|U|F|f|ll|LL|ull|ULL)?\b') ignoreExpr = cppStyleComment.copy() | quotedString.copy() baseExpr = Forward() arrayAccess = originalTextFor(nestedExpr('[', ']', baseExpr, ignoreExpr)) parenthisedExpression = originalTextFor(nestedExpr('(', ')', baseExpr, ignoreExpr)) functionCall = nestedExpr('(', ')', delimitedList(baseExpr), ignoreExpr) alphaNumPlusSafePunctuation = alphanums + '!#$%&\\*+-./:;<=>@^_`{|}~' baseExpr << OneOrMore(originalTextFor(identifier + functionCall) | quotedString.copy() \ | identifier | numericConstant | arrayAccess | parenthisedExpression \ | Word(alphaNumPlusSafePunctuation)) baseExpr.ignore(cppStyleComment.copy()) def targetComponentsForOperatorsInString(operatorNames, codeBlock): """ Return a list of pairs of operator names and their targets that are in `codeString`. The valid operator names searched for are `operatorNames`. For example, if 'L' is in `operatorNames`, then in the code ``L[phi]`` the return value would be ``('L', 'phi', slice(firstCharacterIndex, lastCharacterIndex))``. """ parser = MatchFirst(Keyword(operatorName) for operatorName in operatorNames).setResultsName('name') \ + Optional(nestedExpr('[', ']', baseExpr, ignoreExpr).setResultsName('target')) parser.ignore(cppStyleComment.copy()) parser.ignore(quotedString.copy()) results = [] for tokens, start, end in parser.scanString(codeBlock.codeString): if 'target' in tokens:
# Grammar for a small music notation, built with pyparsing.
# pyparsing's Group is imported as Grp because the name Group is taken by the
# music object class imported on the next line.
from pyparsing import (Regex, OneOrMore, Forward, delimitedList, restOfLine,
                       Group as Grp, Suppress)
from musicobject import Tone, Group, Transformed

# The grammar is recursive, so the top-level element is declared first and
# defined on the last line.
musicobject = Forward()
# '#' starts a comment that runs to the end of the line.
comment = '#' + restOfLine
musicobject.ignore(comment)

#fraction = Regex(r'(\d*[./]?\d*)')
# A number is any run of digits, dots and slashes.
number = Regex(r'[\d./]+')
# NOTE(review): eval() is applied to the matched text.  The regex limits that
# text to digits, '.' and '/', but inputs such as '1/0', '..' or '1//2' still
# reach eval, and the value of 'a/b' depends on the interpreter's division
# rules -- confirm this is acceptable or replace with an explicit parser.
number.setParseAction(lambda s, l, t: [float(eval(t[0]))])

# Symbolic pitch: one of a-g or '_', an optional digit, an optional '#' or '-'.
frequency_symbol = Regex(r'[abcdefg_]\d?[#-]?')
# frequency_number and duration are the very same parser object as `number`.
frequency_number = number
frequency = frequency_number ^ frequency_symbol
duration = number
# A tone is a bare frequency or a "(frequency, duration)" pair; the matched
# tokens become the positional arguments of Tone.
tone = frequency ^ (Suppress('(') + frequency + Suppress(',') + duration + Suppress(')'))
tone.setParseAction(lambda s, l, t: Tone(*t))
# A group is "{ ... , ... }" where each comma-separated part is a run of one
# or more music objects.
group = Suppress('{') + delimitedList(Grp(OneOrMore(musicobject)), ',') + Suppress('}')
group.setParseAction(lambda s, l, t: Group(t))
# "tone * musicobject": t[0] is the tone, t[1] the '*' literal, t[2] the operand.
transformed = tone + '*' + musicobject
transformed.setParseAction(lambda s, l, t: Transformed(t[0], t[2]))

# '^' picks the longest-matching alternative.
musicobject << (tone ^ group ^ transformed)
def getParseTree(self, PassedString): """This method is responsible for parsing the given string and returning a parse tree to the caller.""" # Uhh...? copyStatement = Forward() # Define literals. colon = Literal(':').suppress() leftBracket = Literal('[').suppress() rightBracket = Literal(']').suppress() quote = Literal('"').suppress() copyKeyword = Keyword("copy", caseless=True) nowKeyword = Keyword("now", caseless=True) onKeyword = Keyword("on", caseless=True) reverseKeyword = Keyword("reverse", caseless=True) singleKeyword = Keyword("single", caseless=True) threadsKeyword = Keyword("threads", caseless=True) whereKeyword = Keyword("where", caseless=True) # Basic server group and attribute parsing logic. identifier = Word(alphas + '/', alphanums + '_').setName("identifier") filename = Word(string.letters + string.punctuation).setName("filename") attribStr = delimitedList(identifier) attribStrList = Group(attribStr) groupStr = identifier + ZeroOrMore( leftBracket + attribStrList.setResultsName("attribs") + rightBracket) groupStrList = Group(delimitedList(groupStr)) localfileStr = filename remotefileStr = filename # Extended server group and attribute parsing logic. 
whereExpression = Forward() and_ = Keyword("and", caseless=True) or_ = Keyword("or", caseless=True) binaryOpStr = oneOf("= != < > >= <= eq ne lt le gt ge", caseless=True) integerStr = Word(nums) rightValue = integerStr | quotedString whereCondition = Group(identifier + binaryOpStr + rightValue) whereExpression << whereCondition + ZeroOrMore( (and_ | or_) + whereExpression) # define the grammar copyStatement << ( copyKeyword + localfileStr.setResultsName("local_filename") + Optional(groupStrList.setResultsName("groups") + colon) + remotefileStr.setResultsName("remote_filename") + Optional(Group(whereKeyword + whereExpression), "").setResultsName("where") + Optional(nowKeyword) + Optional(reverseKeyword) + Optional(singleKeyword) + Optional(threadsKeyword)) copyStatement.ignore(pythonStyleComment) CopyParser = copyStatement myTokens = CopyParser.parseString(PassedString) print "Output: tokens = ", myTokens print "Output: tokens.attribs =", myTokens.attribs print "Output: tokens.local_filename =", myTokens.local_filename print "Output: tokens.groups =", myTokens.groups print "Output: tokens.remote_filename = ", myTokens.remote_filename print "Output: tokens.where =", myTokens.where return myTokens
# Case-insensitive INDEX keyword.
index_kw = Keyword('index', caseless=True)

# Index statement: [<create_kw>] INDEX <source> ( <column list> ).
# The source and the column list are exposed as the named results 'source'
# and 'columns'.
_index_stmt = Forward()
_index_stmt << (Optional(create_kw) + index_kw + index_source.setResultsName('source') +
                '(' + column_name_list.setResultsName('columns') + ')')

# Examples:
# index = index_stmt.parseString('INDEX partition1 (col1, col2, col3);')
# print(index.source)
# 'partition1'
# print(index.columns)
# ['col1', 'col2', 'col3']

# define Oracle comment format, and ignore them
# ('--' up to the end of the line, for both the view and the index statement)
oracle_sql_comment = '--' + restOfLine
_view_stmt.ignore(oracle_sql_comment)
_index_stmt.ignore(oracle_sql_comment)


def substitute_vids(library, statement):
    """ Replace all of the references to tables and partitions with their vids.

    This is a bit of a hack -- it ought to work with the parser, but instead it just looks for
    common SQL tokens that indicate an identifier.

    :param statement: an sqlstatement. String.
    :return: tuple: new_statement, set of table vids, set of partition vids.
    """
    from ambry.identity import ObjectNumber, TableNumber, NotObjectNumberError
    from ambry.orm.exc import NotFoundError
def parser(text):
    """
    Parse brace-delimited configuration text and return the first result token.

    The grammar recognises top-level object entries (names optionally followed
    by a ``{ ... }`` dictionary), ``rule`` / ``ltm rule`` blobs whose braces
    are captured as raw text, nested dictionaries and brace-delimited sets.
    Python-style comments are ignored.  The top-level result is built by
    ``GlobDict``.

    Note: this function changes pyparsing's process-wide default whitespace
    characters while it builds the dictionary grammar and then sets them to
    ``' \\t\\r\\n'``.
    """
    # Parse actions converting token lists into the result types.
    cvtTuple = lambda toks: tuple(toks.asList())
    cvtRaw = lambda toks: RawString(' '.join(map(str, toks.asList())))
    #cvtDict = lambda toks: dict(toks.asList())
    cvtGlobDict = lambda toks: GlobDict(toks.asList())
    cvtDict = cvtGlobDict
    # Slice the original input between the offsets stored on the tokens
    # (used with originalTextFor below).
    extractText = lambda s, l, t: RawString(s[t._original_start:t._original_end])

    def pythonize(toks):
        """Convert the first token to True/False/None/int/float when it looks like one."""
        s = toks[0]
        if s == 'true':
            return True
        elif s == 'false':
            return False
        elif s == 'none':
            # Wrapped in a list -- presumably because a bare None returned
            # from a parse action means "leave the tokens unchanged".
            return [None]
        elif s.isdigit():
            return int(s)
        elif re.match('(?i)^-?(\d+\.?e\d+|\d+\.\d*|\.\d+)$', s):
            return float(s)
        return toks[0]

    def noneDefault(s, loc, t):
        """Substitute RawEOL when the optional part matched nothing."""
        return t if len(t) else [RawEOL]

    # define punctuation as suppressed literals
    lbrace, rbrace = map(Suppress, "{}")

    # Any printable run without braces or quote characters.
    identifier = Word(printables, excludeChars='{}"\'')
    # Quoted strings returned without their quotes ...
    quotedStr = QuotedString('"', escChar='\\', multiline=True) | \
                QuotedString('\'', escChar='\\', multiline=True)
    # ... and quoted strings returned with their quotes kept.
    quotedIdentifier = QuotedString('"', escChar='\\', unquoteResults=False) | \
                       QuotedString('\'', escChar='\\', unquoteResults=False)
    # Recursive pieces, defined further down.
    dictStr = Forward()
    setStr = Forward()
    objStr = Forward()

    #anyIdentifier = identifier | quotedIdentifier
    # An identifier immediately followed by a quoted part.
    oddIdentifier = identifier + quotedIdentifier
    dictKey = dictStr | quotedStr | \
              Combine(oddIdentifier).setParseAction(cvtRaw)
    dictKey.setParseAction(cvtRaw)

    dictValue = quotedStr | dictStr | setStr | \
                Combine(oddIdentifier).setParseAction(cvtRaw)

    if OLD_STYLE_KEYS:
        # Keys may contain spaces: every word except one that is followed
        # (after optional spaces) by the end of the line belongs to the key.
        dictKey |= Combine(identifier + ZeroOrMore(White(' ') + (identifier + ~FollowedBy(Optional(White(' ')) + LineEnd()))))
        dictValue |= identifier.setParseAction(pythonize)
    else:
        # Single-word keys; the value is the space-separated remainder,
        # combined into one token.
        dictKey |= identifier
        dictValue |= delimitedList(identifier | quotedIdentifier, delim=White(' '), combine=True).setParseAction(pythonize)

    # Newlines terminate dictionary entries, so they must not be skipped as
    # whitespace by the elements created from here on.
    ParserElement.setDefaultWhitespaceChars(' \t')
    #dictEntry = Group(Combine(OneOrMore(identifier | quotedIdentifier)).setParseAction(cvtRaw) +
    # One entry per line: key, optional value (RawEOL when absent), end of line.
    dictEntry = Group(dictKey + Optional(White(' ').suppress() + dictValue).setParseAction(noneDefault) + Optional(White(' ').suppress()) + LineEnd().suppress())
    #dictEntry = Group(SkipTo(dictKey + LineEnd() + dictKey))
    dictStr << (lbrace + ZeroOrMore(dictEntry) + rbrace)
    dictStr.setParseAction(cvtDict)
    ParserElement.setDefaultWhitespaceChars(' \t\r\n')

    # NOTE(review): setParseAction is applied to the shared `identifier` and
    # to pyparsing's `quotedString`; if it modifies the element in place,
    # every earlier use of `identifier` is affected too -- confirm intended.
    setEntry = identifier.setParseAction(pythonize) | quotedString.setParseAction(removeQuotes)
    setStr << (lbrace + delimitedList(setEntry, delim=White()) + rbrace)
    setStr.setParseAction(cvtTuple)

    # TODO: take other literals as arguments
    # "ltm rule <name> { ... }" or "rule <name> { ... }": the braces' content
    # is kept as raw text rather than parsed.
    blobObj = Group(((Literal('ltm') + Literal('rule') + identifier) | \
                     (Literal('rule') + identifier)).setParseAction(cvtRaw) +
                    originalTextFor(nestedExpr('{', '}')).setParseAction(extractText))

    # Any other object: one or more name parts, then an optional dictionary.
    objEntry = Group(OneOrMore(identifier | quotedIdentifier).setParseAction(cvtRaw) +
                     Optional(dictStr).setParseAction(noneDefault))
    objStr << (Optional(delimitedList(blobObj | objEntry, delim=LineEnd())))
    objStr.setParseAction(cvtGlobDict)
    #objStr.setParseAction(cvtTuple)
    objStr.ignore(pythonStyleComment)

    return objStr.parseString(text)[0]
_index_stmt << ( Optional(create_kw) + index_kw + index_source.setResultsName('source') + '(' + column_name_list.setResultsName('columns') + ')') # Examples: # index = index_stmt.parseString('INDEX partition1 (col1, col2, col3);') # print(index.source) # 'partition1' # print(index.columns) # ['col1', 'col2', 'col3'] # define Oracle comment format, and ignore them oracle_sql_comment = '--' + restOfLine _view_stmt.ignore(oracle_sql_comment) _index_stmt.ignore(oracle_sql_comment) def substitute_vids(library, statement): """ Replace all of the references to tables and partitions with their vids. This is a bit of a hack -- it ought to work with the parser, but instead it just looks for common SQL tokens that indicate an identifier. :param statement: an sqlstatement. String. :return: tuple: new_statement, set of table vids, set of partition vids. """ from ambry.identity import ObjectNumber, TableNumber, NotObjectNumberError from ambry.orm.exc import NotFoundError
def braces_parser(text, opener=BLOB_OPENER, closer=BLOB_CLOSER):
    """
    Parse brace-delimited dictionary text and return the first result token.

    The input is one or more ``{ ... }`` dictionaries separated by line ends;
    each dictionary holds one ``key [value]`` entry per line, where a value
    may be a quoted string, a nested dictionary, a brace-delimited set or
    plain words.  Python-style comments are ignored.

    ``opener`` and ``closer`` are accepted but not referenced anywhere in
    this function body.

    Note: this function changes pyparsing's process-wide default whitespace
    characters while it builds the dictionary grammar and then sets them to
    ``' \\t\\r\\n'``.
    """
    # Parse actions converting token lists into the result types.
    cvtTuple = lambda toks: tuple(toks.asList())  # @IgnorePep8
    cvtRaw = lambda toks: RawString(' '.join(map(str, toks.asList()))
                                    )  # @IgnorePep8
    cvtDict = lambda toks: GlobDict(toks.asList())  # @IgnorePep8
    # Slice the original input between the offsets stored on the tokens
    # (used with originalTextFor below).
    extractText = lambda s, l, t: RawString(s[t._original_start:t._original_end
                                              ])  # @IgnorePep8

    def pythonize(toks):
        """Convert the first token to True/False/None/int/float when it looks like one."""
        s = toks[0]
        if s == 'true':
            return True
        elif s == 'false':
            return False
        elif s == 'none':
            # Wrapped in a list -- presumably because a bare None returned
            # from a parse action means "leave the tokens unchanged".
            return [None]
        elif s.isdigit():
            return int(s)
        elif re.match('(?i)^-?(\d+\.?e\d+|\d+\.\d*|\.\d+)$', s):
            return float(s)
        return toks[0]

    def noneDefault(s, loc, t):
        """Substitute RawEOL when the optional part matched nothing."""
        return t if len(t) else [RawEOL]

    # define punctuation as suppressed literals
    lbrace, rbrace = map(Suppress, "{}")

    # Any printable run without braces or quote characters.
    identifier = Word(printables, excludeChars='{}"\'')
    # Quoted strings returned without their quotes ...
    quotedStr = QuotedString('"', escChar='\\', multiline=True) | \
        QuotedString('\'', escChar='\\', multiline=True)
    # ... and quoted strings returned with their quotes kept.
    quotedIdentifier = QuotedString('"', escChar='\\', unquoteResults=False) | \
        QuotedString('\'', escChar='\\', unquoteResults=False)
    # Recursive pieces, defined further down.
    dictStr = Forward()
    setStr = Forward()
    objStr = Forward()

    # An identifier immediately followed by a quoted part.
    oddIdentifier = identifier + quotedIdentifier
    dictKey = quotedIdentifier | \
        Combine(oddIdentifier).setParseAction(cvtRaw)
    dictKey.setParseAction(cvtRaw)

    dictValue = quotedStr | dictStr | setStr | \
        Combine(oddIdentifier).setParseAction(cvtRaw)

    if OLD_STYLE_KEYS:
        # Keys may contain spaces: every word except one that is followed
        # (after optional spaces) by the end of the line belongs to the key.
        dictKey |= Combine(identifier + ZeroOrMore(
            White(' ') +
            (identifier + ~FollowedBy(Optional(White(' ')) + LineEnd()))))
        dictValue |= identifier.setParseAction(pythonize)
    else:
        # Single-word keys.  The value is the longer match of: the
        # space-separated words, or those words followed by a raw
        # brace-delimited block captured as text.
        dictKey |= identifier
        dictValue |= Or([
            delimitedList(identifier | quotedIdentifier,
                          delim=White(' '),
                          combine=True),
            Combine(
                delimitedList(identifier | quotedIdentifier,
                              delim=White(' '),
                              combine=True) +
                Optional(
                    White(' ') +
                    originalTextFor(nestedExpr('{', '}')).
                    setParseAction(extractText))).setParseAction(cvtRaw)
        ])

    # Newlines terminate dictionary entries, so they must not be skipped as
    # whitespace by the elements created from here on.
    ParserElement.setDefaultWhitespaceChars(' \t')

    # One entry per line: key, optional value (RawEOL when absent), end of line.
    dictEntry = Group(dictKey +
                      Optional(White(' ').suppress() +
                               dictValue).setParseAction(noneDefault) +
                      Optional(White(' ').suppress()) +
                      LineEnd().suppress())
    dictStr << (lbrace + ZeroOrMore(dictEntry) + rbrace)
    dictStr.setParseAction(cvtDict)
    ParserElement.setDefaultWhitespaceChars(' \t\r\n')

    # NOTE(review): setParseAction is applied to the shared `identifier` and
    # to pyparsing's `quotedString`; if it modifies the element in place,
    # every earlier use of `identifier` is affected too -- confirm intended.
    setEntry = identifier.setParseAction(
        pythonize) | quotedString.setParseAction(removeQuotes) | dictStr
    setStr << (lbrace + delimitedList(setEntry, delim=White()) + rbrace)
    setStr.setParseAction(cvtTuple)

    # Top level: dictionaries separated by line ends, comments ignored.
    objEntry = dictStr.ignore(pythonStyleComment)
    objStr << delimitedList(objEntry, delim=LineEnd())

    return objStr.parseString(text)[0]
# An identifier: a name-shaped token that is not one of the reserved keywords.
ID = (~MatchFirst([Keyword(w) for w in _keywords])
      + Regex(r"[a-zA-Z_][a-zA-Z0-9_$]*").setResultsName("id"))

# One suppressed literal per punctuation character, in the order listed.
(LP, RP, LB, RB, LC, RC, COLON, SEMICOLON, CAMMA, PERIOD, SHARP, EQUAL, AT,
 ASTA, Q, PLUS, MINUS, USC, APS) = (
    Suppress(ch) for ch in "()[]{}:;,.#=@*?+-_'"
)
DSLASH = Suppress(Literal("//"))

# Publish every keyword on this module under its case-swapped name.
for k in _keywords:
    setattr(this_mod, k.swapcase(), Keyword(k).setResultsName("keyword"))

# Declare one forward (not yet defined) grammar symbol per line of the
# non-terminal symbols file and publish it on this module under that name.
with open(_non_terminal_symbols_file, "r") as f:
    for name in (line.strip() for line in f):
        sym = Forward().setResultsName(name)
        sym.enablePackrat()
        sym.ignore(cStyleComment)
        setattr(this_mod, name, sym)


def alias(grammar, name):
    """Wrap *grammar* in a Group, attaching *name* as results name when it is truthy."""
    grouped = Group(grammar)
    if not name:
        return grouped
    return grouped.setResultsName(name)


class ErrorReportException(ParseException):
    """ParseException subclass that adds no behaviour of its own."""