def parse_block(self, block_text):
    """Parse a tagged SQL block into pyparsing tokens.

    Valid grammar looks like this:
    {sqlbarchart: title='Some string' | other params as yet unknown...}

    Returns the ParseResults produced by pyparsing: any name=value
    parameters are accessible by name, and the SQL body is available
    under the 'sqltext' results name.
    """
    # make a grammar
    block_start = Literal("{")
    sql_start = Keyword(self.TAGNAME, caseless=True)
    colon = Literal(":")
    sql_end = Literal("}")
    block_end = Keyword("{" + self.TAGNAME + "}", caseless=True)
    # params: name=value pairs separated by '|'
    field_name = Word(alphanums)
    equal_sign = Suppress(Literal("="))
    # whatever value: anything up to the next '|' separator or closing brace
    field_value = CharsNotIn("|}")
    # param name and value
    param_group = Group(field_name + equal_sign + field_value)
    # list of all params; delimitedList handles the '|' separators itself,
    # so no standalone separator token is needed
    param_list = delimitedList(param_group, '|')
    # Dict makes each parameter retrievable by its name on the result
    param_dict = Dict(param_list)
    # sql text: everything up to the closing {TAGNAME} tag
    sql_text = SkipTo(block_end)
    # The grammar is not recursive, so no Forward() indirection is needed
    sqldecl = (block_start +
               sql_start +
               Optional(colon) +
               Optional(param_dict) +
               sql_end +
               sql_text.setResultsName('sqltext') +
               block_end)
    block_str = "".join(block_text)
    return sqldecl.parseString(block_str)
def parse_block(self, block_text):
    """Parse an '{sqltable: ...}...{sqltable}' block into pyparsing tokens.

    Returns the ParseResults produced by pyparsing: any name=value
    parameters are accessible by name, and the SQL body is available
    under the 'sqltext' results name.
    """
    # make a grammar
    block_start = Literal("{")
    sql_start = Keyword("sqltable", caseless=True)
    colon = Literal(":")
    sql_end = Literal("}")
    block_end = Keyword("{sqltable}", caseless=True)
    # params: name=value pairs separated by '|'
    field_name = Word(alphanums)
    equal_sign = Suppress(Literal("="))
    # whatever value: anything up to the next '|' separator or closing brace
    field_value = CharsNotIn("|}")
    # param name and value
    param_group = Group(field_name + equal_sign + field_value)
    # list of all params; delimitedList handles the '|' separators itself,
    # so no standalone separator token is needed
    param_list = delimitedList(param_group, '|')
    # Dict makes each parameter retrievable by its name on the result
    param_dict = Dict(param_list)
    # sql text: everything up to the closing {sqltable} tag
    sql_text = SkipTo(block_end)
    # The grammar is not recursive, so no Forward() indirection is needed
    sqldecl = (block_start +
               sql_start +
               Optional(colon) +
               Optional(param_dict) +
               sql_end +
               sql_text.setResultsName('sqltext') +
               block_end)
    block_str = "".join(block_text)
    return sqldecl.parseString(block_str)
class CreateParser(object):
    """
    This class can take a plain "CREATE TABLE" SQL as input and parse it into
    a Table object, so that we have more insight on the detail of this SQL.
    Example:
    sql = 'create table foo ( bar int primary key )'
    parser = CreateParser(sql)
    try:
        tbl_obj = parser.parse()
    except ParseError:
        log.error("Failed to parse SQL")

    This set of BNF rules are basically translated from the MySQL manual:
    http://dev.mysql.com/doc/refman/5.6/en/create-table.html
    If you don't know how to change the rule or fix the bug,
    <Getting Started with Pyparsing> is probably the best book to start with.
    Also this wiki has all supported functions listed:
    https://pyparsing.wikispaces.com/HowToUsePyparsing
    If you want have more information how these characters are matching, add
    .setDebug(True) after the specific token you want to debug
    """
    # Basic token
    WORD_CREATE = CaselessLiteral("CREATE").suppress()
    WORD_TABLE = CaselessLiteral("TABLE").suppress()
    COMMA = Literal(',').suppress()
    DOT = Literal('.')
    LEFT_PARENTHESES = Literal('(').suppress()
    RIGHT_PARENTHESES = Literal(')').suppress()
    QUOTE = Literal("'") | Literal('"')
    BACK_QUOTE = Optional(Literal('`')).suppress()
    LENGTH = Word(nums)
    OBJECT_NAME = Word(alphanums + "_" + "-" + "<" + ">" + ":")
    QUOTED_STRING_WITH_QUOTE = QuotedString(
        quoteChar="'", escQuote="''", escChar='\\',
        multiline=True, unquoteResults=False) | QuotedString(
        quoteChar='"', escQuote='""', escChar='\\',
        multiline=True, unquoteResults=False)
    QUOTED_STRING = QuotedString(
        quoteChar="'", escQuote="''", escChar='\\',
        multiline=True) | QuotedString(
        quoteChar='"', escQuote='""', escChar='\\', multiline=True)
    # Start of a create table statement
    # Sample: this part of rule will match following section
    # `table_name` IF NOT EXISTS
    IF_NOT_EXIST = Optional(
        CaselessLiteral("IF") + CaselessLiteral("NOT") +
        CaselessLiteral("EXISTS")).suppress()
    TABLE_NAME = (QuotedString(
        quoteChar="`", escQuote="``", escChar='\\',
        unquoteResults=True) | OBJECT_NAME)('table_name')

    # Column definition
    # Sample: this part of rule will match following section
    # `id` bigint(20) unsigned NOT NULL DEFAULT '0',
    COLUMN_NAME = (QuotedString(
        quoteChar="`", escQuote="``", escChar='\\',
        unquoteResults=True) | OBJECT_NAME)('column_name')
    COLUMN_NAME_WITH_QUOTE = (QuotedString(
        quoteChar="`", escQuote="``", escChar='\\',
        unquoteResults=False) | OBJECT_NAME)('column_name')
    UNSIGNED = Optional(CaselessLiteral("UNSIGNED"))('unsigned')
    ZEROFILL = Optional(CaselessLiteral("ZEROFILL"))('zerofill')
    COL_LEN = Combine(LEFT_PARENTHESES + LENGTH + RIGHT_PARENTHESES,
                      adjacent=False)('length')
    # NOTE: "INTEGER" added before "INT" so the full keyword wins; the
    # misspelled "INTERGER" literal is kept for backward compatibility with
    # any SQL that previously relied on it.
    INT_TYPE = (CaselessLiteral("TINYINT") | CaselessLiteral("SMALLINT") |
                CaselessLiteral("MEDIUMINT") | CaselessLiteral("INTEGER") |
                CaselessLiteral("INT") |
                CaselessLiteral("INTERGER") | CaselessLiteral("BIGINT") |
                CaselessLiteral("BINARY") | CaselessLiteral("BIT"))
    INT_DEF = (INT_TYPE('column_type') + Optional(COL_LEN) + UNSIGNED +
               ZEROFILL)
    VARBINARY_DEF = (CaselessLiteral('VARBINARY')('column_type') + COL_LEN)
    FLOAT_TYPE = \
        CaselessLiteral("REAL") | CaselessLiteral("DOUBLE") |\
        CaselessLiteral("FLOAT") | CaselessLiteral("DECIMAL") |\
        CaselessLiteral("NUMERIC")
    FLOAT_LEN = Combine(LEFT_PARENTHESES + LENGTH + Optional(COMMA + LENGTH) +
                        RIGHT_PARENTHESES, adjacent=False,
                        joinString=', ')('length')
    FLOAT_DEF = (FLOAT_TYPE('column_type') + Optional(FLOAT_LEN) + UNSIGNED +
                 ZEROFILL)
    # time type definition. They contain type_name and an optional FSP section
    # Sample: DATETIME[(fsp)]
    FSP = COL_LEN
    DT_DEF = (
        Combine(CaselessLiteral("TIME") + Optional(CaselessLiteral("STAMP"))) |
        CaselessLiteral("DATETIME"))('column_type') + Optional(FSP)
    SIMPLE_DEF = (CaselessLiteral("DATE") | CaselessLiteral("YEAR") |
                  CaselessLiteral("TINYBLOB") | CaselessLiteral("BLOB") |
                  CaselessLiteral("MEDIUMBLOB") | CaselessLiteral("LONGBLOB") |
                  CaselessLiteral("BOOL") |
                  CaselessLiteral("BOOLEAN"))('column_type')
    OPTIONAL_COL_LEN = Optional(COL_LEN)
    BINARY = Optional(CaselessLiteral("BINARY"))('binary')
    CHARSET_NAME = (Optional(QUOTE).suppress() +
                    Word(alphanums + '_')('charset') +
                    Optional(QUOTE).suppress())
    COLLATION_NAME = (Optional(QUOTE).suppress() +
                      Word(alphanums + '_')('collate') +
                      Optional(QUOTE).suppress())
    CHARSET_DEF = Optional(
        CaselessLiteral("CHARACTER SET").suppress() + CHARSET_NAME)
    COLLATE_DEF = Optional(
        CaselessLiteral("COLLATE").suppress() + COLLATION_NAME)
    CHAR_DEF = (CaselessLiteral("CHAR")('column_type') + OPTIONAL_COL_LEN +
                BINARY + CHARSET_DEF + COLLATE_DEF)
    VARCHAR_DEF = (CaselessLiteral("VARCHAR")('column_type') + COL_LEN +
                   BINARY + CHARSET_DEF + COLLATE_DEF)
    TEXT_TYPE = (CaselessLiteral("TINYTEXT") | CaselessLiteral("TEXT") |
                 CaselessLiteral("MEDIUMTEXT") | CaselessLiteral("LONGTEXT") |
                 CaselessLiteral("DOCUMENT"))
    TEXT_DEF = (TEXT_TYPE('column_type') + BINARY + CHARSET_DEF + COLLATE_DEF)
    ENUM_VALUE_LIST = Group(QUOTED_STRING_WITH_QUOTE +
                            ZeroOrMore(COMMA + QUOTED_STRING_WITH_QUOTE))(
        'enum_value_list')
    ENUM_DEF = (CaselessLiteral("ENUM")('column_type') + LEFT_PARENTHESES +
                ENUM_VALUE_LIST + RIGHT_PARENTHESES + CHARSET_DEF +
                COLLATE_DEF)
    SET_VALUE_LIST = Group(QUOTED_STRING_WITH_QUOTE +
                           ZeroOrMore(COMMA + QUOTED_STRING_WITH_QUOTE))(
        'set_value_list')
    SET_DEF = (CaselessLiteral("SET")('column_type') + LEFT_PARENTHESES +
               SET_VALUE_LIST + RIGHT_PARENTHESES + CHARSET_DEF +
               COLLATE_DEF)
    DATA_TYPE = (INT_DEF | FLOAT_DEF | DT_DEF | SIMPLE_DEF | TEXT_DEF |
                 CHAR_DEF | VARCHAR_DEF | ENUM_DEF | SET_DEF | VARBINARY_DEF)

    # Column attributes come after column type and length
    NULLABLE = (CaselessLiteral("NULL") | CaselessLiteral("NOT NULL"))
    DEFAULT_VALUE = (CaselessLiteral("DEFAULT").suppress() + (
        Optional(Literal('b'))('is_bit') +
        QUOTED_STRING_WITH_QUOTE('default') | Combine(
            CaselessLiteral("CURRENT_TIMESTAMP")('default') +
            Optional(COL_LEN)('ts_len')) |
        Word(alphanums + '_' + '-' + '+')('default')))
    ON_UPDATE = (CaselessLiteral("ON") + CaselessLiteral("UPDATE") +
                 (CaselessLiteral("CURRENT_TIMESTAMP")('on_update') +
                  Optional(COL_LEN)('on_update_ts_len')))
    AUTO_INCRE = CaselessLiteral("AUTO_INCREMENT")
    UNIQ_KEY = (CaselessLiteral("UNIQUE") +
                Optional(CaselessLiteral("KEY")).suppress())
    PRIMARY_KEY = (CaselessLiteral("PRIMARY") +
                   Optional(CaselessLiteral("KEY")).suppress())
    COMMENT = Combine(CaselessLiteral("COMMENT").suppress() +
                      QUOTED_STRING_WITH_QUOTE, adjacent=False)
    COLUMN_DEF = Group(COLUMN_NAME + DATA_TYPE + ZeroOrMore(
        NULLABLE('nullable') | DEFAULT_VALUE | ON_UPDATE |
        AUTO_INCRE('auto_increment') | UNIQ_KEY('uniq_key') |
        PRIMARY_KEY('primary') | COMMENT('comment')))
    COLUMN_LIST = Group(COLUMN_DEF +
                        ZeroOrMore(COMMA + COLUMN_DEF))('column_list')
    DOCUMENT_PATH = Combine(COLUMN_NAME_WITH_QUOTE +
                            ZeroOrMore(DOT + COLUMN_NAME_WITH_QUOTE))
    IDX_COL = ((Group(DOCUMENT_PATH + CaselessLiteral('AS') +
                      (CaselessLiteral('INT') | CaselessLiteral('STRING')) +
                      Optional(COL_LEN, default=''))) |
               (Group(COLUMN_NAME + Optional(COL_LEN, default=''))))

    # Primary key section
    COL_NAME_LIST = Group(IDX_COL + ZeroOrMore(COMMA + IDX_COL))
    IDX_COLS = (LEFT_PARENTHESES + COL_NAME_LIST + RIGHT_PARENTHESES)
    WORD_PRI_KEY = (CaselessLiteral("PRIMARY").suppress() +
                    CaselessLiteral("KEY").suppress())
    KEY_BLOCK_SIZE = (CaselessLiteral("KEY_BLOCK_SIZE").suppress() +
                      Optional(Literal('=')) +
                      Word(nums)('idx_key_block_size'))
    INDEX_USING = (
        CaselessLiteral("USING").suppress() +
        (CaselessLiteral("BTREE") | CaselessLiteral("HASH"))('idx_using'))
    INDEX_OPTION = (ZeroOrMore(KEY_BLOCK_SIZE | COMMENT('idx_comment') |
                               INDEX_USING))
    PRI_KEY_DEF = (COMMA + WORD_PRI_KEY + IDX_COLS('pri_list') +
                   INDEX_OPTION)

    # Index section
    KEY_TYPE = (CaselessLiteral("FULLTEXT") |
                CaselessLiteral("SPATIAL"))('key_type')
    WORD_UNIQUE = CaselessLiteral("UNIQUE")('unique')
    WORD_KEY = (CaselessLiteral("INDEX").suppress() |
                CaselessLiteral("KEY").suppress())
    IDX_NAME = Optional(COLUMN_NAME)
    IDX_DEF = (ZeroOrMore(
        Group(COMMA + Optional(WORD_UNIQUE | KEY_TYPE) + WORD_KEY +
              IDX_NAME('index_name') + IDX_COLS('index_col_list') +
              INDEX_OPTION)))('index_section')

    # Constraint section as this is not a recommended way of using MySQL
    # we'll treat the whole section as a string
    CONSTRAINT = Combine(
        ZeroOrMore(COMMA + Optional(CaselessLiteral('CONSTRAINT')) +
                   # foreign key name except the key word 'FOREIGN'
                   Optional((~CaselessLiteral('FOREIGN') + COLUMN_NAME)) +
                   CaselessLiteral('FOREIGN') + CaselessLiteral('KEY') +
                   LEFT_PARENTHESES + COL_NAME_LIST + RIGHT_PARENTHESES +
                   CaselessLiteral('REFERENCES') + COLUMN_NAME +
                   LEFT_PARENTHESES + COL_NAME_LIST + RIGHT_PARENTHESES +
                   ZeroOrMore(Word(alphanums))),
        adjacent=False, joinString=' ')('constraint')

    # Table option section
    ENGINE = (CaselessLiteral("ENGINE").suppress() +
              Optional(Literal('=')).suppress() +
              COLUMN_NAME('engine').setParseAction(upcaseTokens))
    DEFAULT_CHARSET = (Optional(CaselessLiteral("DEFAULT")).suppress() +
                       ((CaselessLiteral("CHARACTER").suppress() +
                         CaselessLiteral("SET").suppress()) |
                        (CaselessLiteral("CHARSET").suppress())) +
                       Optional(Literal('=')).suppress() +
                       Word(alphanums + '_')('charset'))
    TABLE_COLLATE = (Optional(CaselessLiteral("DEFAULT")).suppress() +
                     CaselessLiteral("COLLATE").suppress() +
                     Optional(Literal('=')).suppress() + COLLATION_NAME)
    ROW_FORMAT = (
        CaselessLiteral("ROW_FORMAT").suppress() +
        Optional(Literal('=')).suppress() +
        Word(alphanums + '_')('row_format').setParseAction(upcaseTokens))
    TABLE_KEY_BLOCK_SIZE = (
        CaselessLiteral("KEY_BLOCK_SIZE").suppress() +
        Optional(Literal('=')).suppress() +
        Word(nums)('key_block_size').setParseAction(
            lambda s, l, t: [int(t[0])]))
    COMPRESSION = (
        CaselessLiteral("COMPRESSION").suppress() +
        Optional(Literal('=')).suppress() +
        Word(alphanums + '_')('compression').setParseAction(upcaseTokens))
    # Parse and make sure auto_increment is an integer
    # parseAction function is defined as fn( s, loc, toks ), where:
    # s is the original parse string
    # loc is the location in the string where matching started
    # toks is the list of the matched tokens, packaged as a ParseResults_
    # object
    TABLE_AUTO_INCRE = (
        CaselessLiteral("AUTO_INCREMENT").suppress() +
        Optional(Literal('=')).suppress() +
        Word(nums)('auto_increment').setParseAction(
            lambda s, l, t: [int(t[0])]))
    TABLE_COMMENT = (CaselessLiteral("COMMENT").suppress() +
                     Optional(Literal('=')).suppress() +
                     QUOTED_STRING_WITH_QUOTE('comment'))
    TABLE_OPTION = ZeroOrMore(ENGINE | DEFAULT_CHARSET | TABLE_COLLATE |
                              ROW_FORMAT | TABLE_KEY_BLOCK_SIZE |
                              COMPRESSION | TABLE_AUTO_INCRE | TABLE_COMMENT)

    # Partition section
    PARTITION = Optional(
        Combine(Combine(Optional(Literal('/*!') + Word(nums))) +
                CaselessLiteral("PARTITION") + CaselessLiteral("BY") +
                SkipTo(StringEnd()),
                adjacent=False, joinString=" ")('partition'))

    @classmethod
    def generate_rule(cls):
        """Assemble and return the pyparsing rule for a full CREATE TABLE."""
        # The final rule for the whole statement match
        return (cls.WORD_CREATE + cls.WORD_TABLE + cls.IF_NOT_EXIST +
                cls.TABLE_NAME + cls.LEFT_PARENTHESES + cls.COLUMN_LIST +
                Optional(cls.PRI_KEY_DEF) + cls.IDX_DEF + cls.CONSTRAINT +
                cls.RIGHT_PARENTHESES + cls.TABLE_OPTION('table_options') +
                cls.PARTITION)

    @classmethod
    def parse(cls, sql):
        """Parse *sql* (a CREATE TABLE statement) into a models.Table.

        Raises ParseError on unsupported syntax or when both an inline
        primary key and a separate PRIMARY KEY clause are present.
        """
        try:
            result = cls.generate_rule().parseString(sql)
        except ParseException as e:
            raise ParseError(
                "Failed to parse SQL, unsupported syntax: {}".format(e),
                e.line, e.column)

        inline_pri_exists = False
        table = models.Table()
        table.name = result.table_name
        table_options = [
            'engine', 'charset', 'collate', 'row_format', 'key_block_size',
            'compression', 'auto_increment', 'comment'
        ]
        for table_option in table_options:
            if table_option in result.table_options:
                setattr(table, table_option,
                        result.table_options.get(table_option))
        if 'partition' in result:
            table.partition = result.partition
        if 'constraint' in result:
            table.constraint = result.constraint
        for column_def in result.column_list:
            if column_def.column_type == 'ENUM':
                column = models.EnumColumn()
                for enum_value in column_def.enum_value_list:
                    column.enum_list.append(enum_value)
            elif column_def.column_type == 'SET':
                column = models.SetColumn()
                for set_value in column_def.set_value_list:
                    column.set_list.append(set_value)
            elif column_def.column_type in ('TIMESTAMP', 'DATETIME'):
                column = models.TimestampColumn()
                if 'on_update' in column_def:
                    if 'on_update_ts_len' in column_def:
                        column.on_update_current_timestamp = \
                            "{}({})".format(
                                column_def.on_update,
                                column_def.on_update_ts_len)
                    else:
                        column.on_update_current_timestamp = \
                            column_def.on_update
            else:
                column = models.Column()

            column.name = column_def.column_name
            column.column_type = column_def.column_type

            # We need to check whether each column property exist in the
            # create table string, because not specifying a "COMMENT" is
            # different from specifying "COMMENT" equals to empty string.
            # The former one will ends up being
            #   column=None
            # and the later one being
            #   column=''
            if 'comment' in column_def:
                column.comment = column_def.comment
            if 'nullable' in column_def:
                if column_def.nullable == 'NULL':
                    column.nullable = True
                elif column_def.nullable == 'NOT NULL':
                    column.nullable = False
            if 'unsigned' in column_def:
                if column_def.unsigned == 'UNSIGNED':
                    column.unsigned = True
            if 'default' in column_def:
                if 'ts_len' in column_def:
                    column.default = "{}({})".format(column_def.default,
                                                     column_def.ts_len)
                else:
                    column.default = column_def.default
                if 'is_bit' in column_def:
                    column.is_default_bit = True
            if 'charset' in column_def:
                column.charset = column_def.charset
            if 'length' in column_def:
                column.length = column_def.length
            if 'collate' in column_def:
                column.collate = column_def.collate
            if 'auto_increment' in column_def:
                column.auto_increment = True
            if 'primary' in column_def:
                idx_col = models.IndexColumn()
                idx_col.name = column_def.column_name
                table.primary_key.column_list.append(idx_col)
                inline_pri_exists = True
            table.column_list.append(column)

        if 'pri_list' in result:
            if inline_pri_exists:
                raise ParseError("Multiple primary keys defined")
            table.primary_key.name = 'PRIMARY'
            for col in result.pri_list:
                for name, length in col:
                    idx_col = models.IndexColumn()
                    idx_col.name = name
                    if length:
                        idx_col.length = length
                    table.primary_key.column_list.append(idx_col)
            if 'idx_key_block_size' in result:
                # BUGFIX: the grammar names this result 'idx_key_block_size';
                # the old code read the nonexistent 'pri_key_block_size'.
                table.primary_key.key_block_size = result.idx_key_block_size
            if 'idx_comment' in result:
                table.primary_key.comment = result.idx_comment

        if 'index_section' in result:
            for idx_def in result.index_section:
                idx = models.TableIndex()
                idx.name = idx_def.index_name
                if 'idx_key_block_size' in idx_def:
                    idx.key_block_size = idx_def.idx_key_block_size
                if 'idx_comment' in idx_def:
                    idx.comment = idx_def.idx_comment
                if 'idx_using' in idx_def:
                    idx.using = idx_def.idx_using
                if 'key_type' in idx_def:
                    idx.key_type = idx_def.key_type
                if 'unique' in idx_def:
                    idx.is_unique = True
                for col in idx_def.index_col_list:
                    for col_def in col:
                        if len(col_def) == 4 and col_def[1].upper() == 'AS':
                            (document_path, word_as, key_type,
                             length) = col_def
                            idx_col = models.DocStoreIndexColumn()
                            idx_col.document_path = document_path
                            idx_col.key_type = key_type
                            if length:
                                idx_col.length = length
                            idx.column_list.append(idx_col)
                        else:
                            (name, length) = col_def
                            idx_col = models.IndexColumn()
                            idx_col.name = name
                            if length:
                                idx_col.length = length
                            idx.column_list.append(idx_col)
                table.indexes.append(idx)
        return table
# A field value is one or more strings joined with '#' (BibTeX concatenation)
field_value = string + ZeroOrMore(HASH + string)
field_def = Group(field_name + EQUALS + field_value)
# Fields are comma-separated; a trailing comma after the last field is allowed
entry_contents = Dict(ZeroOrMore(field_def + COMMA) + Optional(field_def))

# Entry is surrounded either by parentheses or curlies
entry = (AT + entry_type + bracketed(cite_key + COMMA + entry_contents))

# Preamble is a macro-like thing with no name
preamble = AT + CaselessLiteral('preamble') + bracketed(field_value)

# Macros (aka strings)
macro_contents = macro_def + EQUALS + field_value
macro = AT + CaselessLiteral('string') + bracketed(macro_contents)

# Implicit comments: everything up to the next '@', tagged with 'icomment'
icomment = SkipTo('@').setParseAction(lambda t: t.insert(0, 'icomment'))

# entries are last in the list (other than the fallback) because they have
# arbitrary start patterns that would match comments, preamble or macro
definitions = Group(comment | preamble | macro | entry | icomment)

# Start symbol
bibfile = ZeroOrMore(definitions)


def parse_str(str):
    """Parse a BibTeX document string and return the pyparsing results."""
    # NOTE(review): the parameter name shadows the builtin `str`; renaming
    # would be cleaner but could break keyword callers — left as-is here.
    return bibfile.parseString(str)
def compute(self, text, verbose=True):
    """Transform a raw spell-description string into display text.

    Builds a set of pyparsing transformers (icons, language choices,
    call variables, spell references, expressions, conditions), applies
    them to *text* in a fixed order via transformString, substitutes
    accumulated variable placeholders, applies cosmetic fixes, and
    delegates the result to the parent class compute().
    """
    # Literals
    dollar = Literal('$')
    amper = Literal('&')
    at = Literal('@')
    qm = Literal('?')
    em = Literal('!')
    dot = Literal('.')
    colon = Literal(":")
    vbar = Literal("|")
    lbrack = Literal("[")
    rbrack = Literal("]")
    lcurly = Literal("{")
    rcurly = Literal("}")
    lparen = Literal("(")
    rparen = Literal(")")
    lt = Literal("<")
    gt = Literal(">")
    eq = Literal("=")
    deq = Literal("==")

    # Reusables
    # Numeric spell id: 2-6 digits, converted to int
    spellId = Word(nums, min=2, max=6).addParseAction(
        tokenMap(int)).setResultsName("spellId")
    # Single-digit index, converted to int
    idx = Word(nums, max=1).addParseAction(tokenMap(int)).setResultsName("id")
    var = Word(alphas).setResultsName("var")

    # Spell References
    effectId = Optional(
        Word(nums, max=2).addParseAction(
            tokenMap(int)).setResultsName("effectId"))
    # Forms: $@var[spellId] | $spellId var effectId | $var effectId
    references = (dollar.suppress() +
                  ((at.suppress() + var + Optional(spellId)) |
                   (spellId + var + effectId) |
                   (var + effectId))).addParseAction(self.setReferences)

    # Conditions
    # Bracketed text captured as the condition "statement"
    brackets = Suppress(lbrack) + SkipTo(rbrack).setResultsName(
        "statement") + Suppress(rbrack)
    value = Word(nums, max=5).addParseAction(
        tokenMap(int)).setResultsName("value")
    conditionVar = Group(
        Optional(em).setResultsName("not") + Optional(var) +
        (spellId | idx) | Optional("-") + value |
        Word(alphanums, exact=8).setResultsName("hashVariable"))
    conditions = ((dollar + qm).suppress() + OneOrMore(
        Group(
            Optional(Suppress(qm)) + Optional(Suppress(lparen)) + OneOrMore(
                conditionVar.setResultsName("variables*") +
                Optional(Combine(em + eq) | amper | vbar | deq | lt |
                         gt).setResultsName("operators*")) +
            Optional(Suppress(rparen)) +
            brackets).setResultsName("conditions*")) +
        brackets).addParseAction(lambda t: self.setConditions(
            t, verbose=verbose)) + Optional(dot.suppress())

    # Call Variable: <$name> or $<name>
    callVariables = (Suppress((lt + dollar) | (dollar + lt)) +
                     SkipTo(gt).setResultsName("name") +
                     Suppress(gt)).addParseAction(self.callVariables)

    # Expressions: ${content} with an optional .N modifier
    expressions = (
        Suppress(dollar + lcurly) +
        SkipTo(rcurly).setResultsName("content") + rcurly + Optional(
            dot.suppress() + Word(nums, exact=1).addParseAction(
                tokenMap(int)).setResultsName("mod"),
        )
    ).addParseAction(lambda t: self.setExpressions(t, verbose=verbose))

    # Language Choices: $Lword:word...; (also $l)
    languageChoices = (
        (Literal('$L') | Literal('$l')).suppress() +
        OneOrMore(Word(alphas) + Optional(Literal(":").suppress())
                  ).setResultsName("options*") +
        Literal(';').suppress()).addParseAction(self.setLanguageChoices)

    # Icons: |Tpath:size|t
    icons = (Literal("|T").suppress() +
             SkipTo(colon).setResultsName("path") + colon.suppress() +
             Word(nums, exact=2).addParseAction(
                 tokenMap(int)).setResultsName("size") +
             Literal("|t").suppress()).addParseAction(self.setIcons)

    # Parsing layer by layer — order matters: each transformer consumes
    # its markers before the next one runs.
    parsingOrder = [
        icons, languageChoices, callVariables, references, expressions,
        conditions
    ]
    steps = [text]
    for parser in parsingOrder:
        steps.append(parser.transformString(steps[-1]))
    result = steps[-1]

    # Replace each Sha1 Hash placeholder by referring value
    if verbose:
        for k, v in self.variables.items():
            result = result.replace(k, str(v))

    # Display fixes
    displayFixes = [["*% of", "% of"], ["power)%", "power)"]]
    for bef, aft in displayFixes:
        result = result.replace(bef, aft)
    return super(SpellDescriptionParser, self).compute(result, verbose)
def labeled_float(label):
    """Match a FLOAT that follows *label*, discarding everything up to and
    including the label itself."""
    skip_to_label = Suppress(SkipTo(label))
    consumed_label = Suppress(label)
    return skip_to_label + consumed_label + FLOAT
class GiftParser(object):
    """Parser for GIFT format questions.

    Builds one pyparsing grammar per GIFT question type (multiple choice,
    true/false, short answer, matching, missing word, numeric, essay) and
    exposes `parse`/`parse_questions` classmethods over their alternation.
    """

    # separators, which have been suppressed
    double_colon = sep('::')
    colon = sep(':')
    span = sep('..')
    left_curly = sep('{')
    right_curly = sep('}')
    equals = sep('=')
    tilda = sep('~')
    percent = sep('%')
    arrow = sep('->')
    pound = sep('#')
    dbl_fwd_slash = sep('//')

    # integer signs
    plus = Literal('+')
    minus = Literal('-')

    bool_true = (Literal('TRUE') | Literal('T')).setParseAction(make_true)
    bool_false = (Literal('FALSE') | Literal('F')).setParseAction(make_false)
    boolean = bool_true | bool_false

    plus_or_minus = plus | minus
    number = Word(nums)
    integer = Combine(Optional(plus_or_minus) +
                      number).setParseAction(make_int)

    unsigned_float = Combine(
        Word(nums) + Optional(Word('.', nums))).setParseAction(make_float)
    signed_float = Combine(
        Optional(plus_or_minus) + Word(nums) +
        Optional(Word('.', nums))).setParseAction(make_float)

    blank_lines = Suppress(LineEnd() + OneOrMore(LineEnd()))
    comment = dbl_fwd_slash + restOfLine

    # '::title::' header preceding a question
    title = (double_colon +
             SkipTo(double_colon).setParseAction(strip_spaces)('title') +
             double_colon)
    # Question text: everything before the opening '{'
    task = SkipTo(left_curly).setParseAction(strip_spaces)('task')

    # Multiple choice questions with one correct answer.
    #
    # // question: 1 name: Grants tomb
    # ::Grants tomb::Who is buried in Grant's tomb in New York City? {
    # =Grant
    # ~No one
    # #Was true for 12 years
    # ~Napoleon
    # #He was buried in France
    # ~Churchill
    # #He was buried in England
    # ~Mother Teresa
    # #She was buried in India }
    #
    # Multiple choice questions with multiple right answers.
    #
    # What two people are entombed in Grant's tomb? {
    # ~%-100%No one
    # ~%50%Grant
    # ~%50%Grant's wife
    # ~%-100%Grant's father}
    eof_multi_choice_answer = equals | tilda | right_curly
    ext_eof_multi_choice_answer = pound | eof_multi_choice_answer

    # '# hello world ~'
    multi_choice_feedback = Combine(
        pound + SkipTo(eof_multi_choice_answer).setParseAction(strip_spaces))

    # 'answer #'
    multi_choice_answer_text = SkipTo(
        ext_eof_multi_choice_answer).setParseAction(strip_spaces)

    # '%50%' credit weight between percent signs
    weight = Combine(percent + integer + percent).setParseAction(make_int)

    # NOTE(review): '|' binds looser than '+', so the optional weight only
    # attaches to '~' answers, never '=' answers — confirm this is intended.
    multi_choice_answer = (
        (Literal('=')('sign') |
         Literal('~')('sign') + Optional(weight, default=0)('weight')) +
        multi_choice_answer_text('answer') +
        Optional(multi_choice_feedback, default='')('feedback')
    ).setParseAction(set_multi_choice_answer)

    multi_choice_answers = OneOrMore(multi_choice_answer)

    multi_choice_question = (
        Optional(title, default='') + task + left_curly +
        multi_choice_answers.setParseAction(set_multi_choice_answers) +
        right_curly).setParseAction(set_multi_choice_question)
    multi_choice_question.ignore(comment)

    # True-false questions.
    # Sample:
    # // question: 0 name: TrueStatement using {T} style
    # ::TrueStatement about Grant::Grant was buried in a tomb in NY.{T}
    #
    # // question: 0 name: FalseStatement using {FALSE} style
    # ::FalseStatement about sun::The sun rises in the West.{FALSE}
    true_false_feedback = Combine(
        pound + SkipTo(right_curly).setParseAction(strip_spaces))
    true_false_answer = (
        left_curly + boolean('answer') +
        Optional(true_false_feedback, default='')('feedback') +
        right_curly)
    true_false_question = (
        Optional(title, default='') + task +
        true_false_answer).setParseAction(set_true_false_question)
    true_false_question.ignore(comment)

    # Short answer questions.
    # Samples:
    # Who's buried in Grant's tomb?{=Grant =Ulysses S. Grant =Ulysses Grant}
    # Two plus two equals {=four =4}
    eof_short_answer_answer = equals | right_curly
    ext_eof_short_answer = pound | eof_short_answer_answer
    short_answer_feedback = Combine(
        pound + SkipTo(eof_short_answer_answer).setParseAction(strip_spaces))
    short_answer_text = SkipTo(ext_eof_short_answer).setParseAction(
        strip_spaces)
    short_answer = (equals + short_answer_text('answer') +
                    Optional(short_answer_feedback, default='')('feedback')
                    ).setParseAction(set_short_answer)
    short_answers = (OneOrMore(short_answer) + right_curly + LineEnd())
    short_answer_question = (Optional(title, default='') + task + left_curly +
                             short_answers.setParseAction(set_short_answers)
                             ).setParseAction(set_short_answer_question)
    short_answer_question.ignore(comment)

    # Matching questions.
    # Sample:
    # Match the following countries with their corresponding capitals. {
    # =Canada -> Ottawa
    # =Italy -> Rome
    # =Japan -> Tokyo
    # =India -> New Delhi
    # }
    eof_match_answer = equals | right_curly
    ext_eof_match_answer = pound | equals | right_curly
    match_feedback = Combine(
        pound + SkipTo(eof_match_answer).setParseAction(strip_spaces))
    lhs = SkipTo(arrow).setParseAction(strip_spaces)
    match_answer = (equals + lhs('lhs') + arrow +
                    SkipTo(ext_eof_match_answer)('rhs') +
                    Optional(match_feedback, default='')('feedback')
                    ).setParseAction(set_match_answer)
    # Matching requires at least three pairs (two explicit + OneOrMore)
    match_answers = (left_curly + match_answer + match_answer +
                     OneOrMore(match_answer) + right_curly)
    match_question = (Optional(title, default='') + task +
                      match_answers.setParseAction(set_match_answers)
                      ).setParseAction(set_match_answer_question)
    match_question.ignore(comment)

    # Missing word questions.
    #
    # CB costs {~lots of money =nothing ~a small amount} to download.
    missing_word_answers = multi_choice_answers
    prefix = SkipTo(left_curly)
    suffix = Combine(OneOrMore(Word(alphanums)))
    missing_word_question = (
        prefix('prefix') + left_curly +
        missing_word_answers.setParseAction(set_multi_choice_answers) +
        right_curly +
        suffix('suffix')).setParseAction(set_missing_word_question)

    # Numeric questions.
    # No support for multiple numeric answers.
    # Sample: When was Ulysses S. Grant born?{#1822:5}
    numeric_single_answer = (
        left_curly + pound +
        signed_float.setParseAction(make_float)('answer') +
        Optional(colon + unsigned_float.setParseAction(make_float)('error')) +
        Optional(match_feedback, default='')('feedback') + right_curly)
    numeric_range_answer = (left_curly + pound +
                            signed_float.setParseAction(make_float)('min') +
                            span +
                            signed_float.setParseAction(make_float)('max') +
                            right_curly)
    numeric_answer = (numeric_range_answer | numeric_single_answer)
    numeric_question = (Optional(title, default='') + task +
                        numeric_answer).setParseAction(set_numeric_question)
    numeric_question.ignore(comment)

    # Essay questions.
    # Write a short biography of Dag Hammarskjold. {}
    essay_answer = left_curly + right_curly
    essay_question = (Optional(title, default='') + task +
                      essay_answer).setParseAction(set_essay_question)
    essay_question.ignore(comment)

    # Alternation order matters: more specific forms are tried first
    question = (essay_question | match_question | numeric_question |
                missing_word_question | multi_choice_question |
                true_false_question | short_answer_question)

    bnf = OneOrMore(question)

    @classmethod
    def parse(cls, text):
        """Parse GIFT *text*; raise ParseError on invalid syntax."""
        try:
            return cls.bnf.parseString(text)
        except ParseException as e:
            logging.exception('Invalid GIFT syntax: %s', text)
            raise ParseError(e.msg)

    @classmethod
    def parse_questions(cls, text):
        """Parse newline-separated GIFT questions into question objects."""
        tree = cls.parse(text)
        return [GiftAdapter().convert_to_question(node) for node in tree]
def parsePossibleURL(t):
    """Parse action: split the url(...) body into [split URL, fallback].

    Returns [urlparse.urlsplit(possibleURL), fallback], where fallback is
    the optional color-declaration group matched after the closing paren.
    """
    # Workaround for PyParsing versions < 2.1.0, for which t is wrapped in an
    # extra level of nesting. See enthought/enable#224.
    if len(t) == 1:
        t = t[0]
    possibleURL, fallback = t
    return [urlparse.urlsplit(possibleURL), fallback]


# Normal color declaration
colorDeclaration = none | currentColor | colourValue

# End of a url(...) clause: ')', optional fallback color, then end of string
urlEnd = (Literal(")").suppress() +
          Optional(Group(colorDeclaration), default=()) +
          StringEnd())

url = (CaselessLiteral("URL") + Literal("(").suppress() +
       Group(SkipTo(urlEnd, include=True).setParseAction(parsePossibleURL)))

# paint value will parse into a (type, details) tuple.
# For none and currentColor, the details tuple will be the empty tuple
# for CSS color declarations, it will be (type, (R,G,B))
# for URLs, it will be ("URL", ((url tuple), fallback))
# The url tuple will be as returned by urlparse.urlsplit, and can be
# an empty tuple if the parser has an error
# The fallback will be another (type, details) tuple as a parsed
# colorDeclaration, but may be the empty tuple if it is not present
paintValue = url | colorDeclaration
# Scan a fetched web page for anchor tags and print each link's text and
# href. (Python 2 script: urllib.urlopen and print statements.)
from pyparsing import makeHTMLTags, SkipTo, htmlComment
import urllib

# Fetch the page to scan
serverListPage = urllib.urlopen("http://www.yahoo.com")
htmlText = serverListPage.read()
serverListPage.close()

# Grammar for an anchor: <A ...> link text </A>; makeHTMLTags tolerates
# attributes and case variations, and htmlComment sections are skipped
aStart, aEnd = makeHTMLTags("A")
link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
link.ignore(htmlComment)

# startA carries the opening tag's attributes (e.g. href)
for toks, start, end in link.scanString(htmlText):
    print toks.link, "->", toks.startA.href
# vim: set encoding=utf-8 from pyparsing import (LineStart, Literal, OneOrMore, Optional, Regex, SkipTo, srange, Suppress, Word, ZeroOrMore) from regparser.grammar import atomic, unified from regparser.grammar.utils import (DocLiteral, keep_pos, Marker, QuickSearchable) smart_quotes = QuickSearchable( Suppress(DocLiteral(u'“', "left-smart-quote")) + keep_pos( SkipTo(DocLiteral(u'”', "right-smart-quote"))).setResultsName("term")) e_tag = ( Suppress(Regex(r"<E[^>]*>")) + keep_pos(OneOrMore(Word(srange("[a-zA-Z-]")))).setResultsName("term") + Suppress(Literal("</E>"))) xml_term_parser = QuickSearchable( LineStart() + Optional(Suppress(unified.any_depth_p)) + e_tag.setResultsName("head") + ZeroOrMore((atomic.conj_phrases + e_tag).setResultsName("tail", listAllMatches=True)) + Suppress(ZeroOrMore(Regex(r",[a-zA-Z ]+,"))) + Suppress(ZeroOrMore((Marker("this") | Marker("the")) + Marker("term"))) + ((Marker("mean") | Marker("means")) | (Marker("refers") + ZeroOrMore(Marker("only")) + Marker("to")) | ( (Marker("has") | Marker("have")) + Marker("the") + Marker("same") + Marker("meaning") + Marker("as")))) key_term_parser = QuickSearchable( LineStart() + Optional(Suppress(unified.any_depth_p)) +
# Extract hyperlinks (URL + link text) from a fetched web page.
# (Python 2 script: urllib.urlopen and print statements.)
from pyparsing import Literal,Suppress,CharsNotIn,CaselessLiteral,\
    Word,dblQuotedString,alphanums,SkipTo
import urllib
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link. Note also
# that it is not necessary to explicitly show this in the pyparsing grammar;
# by default, pyparsing skips over whitespace between tokens.
linkOpenTag = (Literal("<") + "a" + "href" + "=").suppress() + \
    (dblQuotedString | Word(alphanums + "/")) + \
    Suppress(">")
linkCloseTag = Literal("<") + "/" + CaselessLiteral("a") + ">"
link = linkOpenTag + SkipTo(linkCloseTag) + linkCloseTag.suppress()

# Go get some HTML with some links in it.
serverListPage = urllib.urlopen("http://www.yahoo.com")
htmlText = serverListPage.read()
serverListPage.close()

# scanString is a generator that loops through the input htmlText, and for
# each match yields the tokens and start and end locations (for this
# application, we are not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print toks.asList()

# Rerun scanString, but this time create a dict of text:URL key-value pairs.
# Need to reverse the tokens returned by link, using a parse action.
link.setParseAction(lambda st, loc, toks: [toks[1], toks[0]])
    def __init__(self):
        """Build pyparsing extractors for values scattered through RAxML
        output: alpha shape parameters, likelihoods, base frequencies,
        substitution rates, model names, and tree-certainty scores.

        NOTE(review): FLOAT, INT, WORD and SPACEDWORD are terminals defined
        outside this chunk.
        """
        # Labels that anchor each value in the raw RAxML log text.
        self.ALPHA_LABEL = Regex(r'alpha\[\d+\]:')
        self.LNL_LABEL = Literal('Final GAMMA-based Score of best tree')
        self.FRQ_LABEL = Regex(r'Base frequencies: (?=\d+)') ^ \
            Regex(r'ML estimate base freqs\[\d+\]:')
        self.NAMES_LABEL = Regex(r'Partition: \d+ with name:\s+')
        self.RATES_LABEL = Regex(r'rates\[\d+\].+?:')
        self.MODEL_LABEL = Literal('Substitution Matrix:')
        # Each extractor skips ahead to its label, suppresses it, then
        # captures the payload that follows.
        self.alpha = OneOrMore(
            Suppress(SkipTo(self.ALPHA_LABEL)) + Suppress(self.ALPHA_LABEL) +
            FLOAT)
        self.lnl = Suppress(SkipTo(self.LNL_LABEL)) + \
            Suppress(self.LNL_LABEL) + FLOAT
        self.frq = OneOrMore(
            Group(
                Suppress(SkipTo(self.FRQ_LABEL)) + Suppress(self.FRQ_LABEL) +
                OneOrMore(FLOAT)))
        self.names = OneOrMore(
            Suppress(SkipTo(self.NAMES_LABEL)) + Suppress(self.NAMES_LABEL) +
            CharsNotIn('\n') + Suppress(LineEnd()))
        self.rates = OneOrMore(
            Group(
                Suppress(SkipTo(self.RATES_LABEL)) +
                Suppress(self.RATES_LABEL) + OneOrMore(FLOAT)))
        self.model = Suppress(SkipTo(self.MODEL_LABEL)) + \
            Suppress(self.MODEL_LABEL) + WORD
        # Labels and extractors local to this constructor (used below to build
        # the combined "-f e" parser and the flag-'a' attributes).
        MODEL_LABEL = Literal('Substitution Matrix:')
        SCORE_LABEL = Literal('Final GAMMA likelihood:')
        BOOT_SCORE_LABEL = Literal('Final ML Optimization Likelihood:')
        DESC_LABEL = Literal('Model Parameters of Partition')
        NAME_LEADIN = Literal(', Name:')
        DATATYPE_LEADIN = Literal(', Type of Data:')
        ALPHA_LEADIN = Literal('alpha:')
        TREELENGTH_LEADIN = Literal('Tree-Length:')
        RATES_LABEL = Regex(r'rate \w <-> \w:')
        FREQS_LABEL = Regex(r'freq pi\(\w\):')
        likelihood = Suppress(
            SkipTo(SCORE_LABEL)) + Suppress(SCORE_LABEL) + FLOAT
        boot_likelihood = Suppress(
            SkipTo(BOOT_SCORE_LABEL)) + Suppress(BOOT_SCORE_LABEL) + FLOAT
        description = Suppress(
            SkipTo(DESC_LABEL)) + Suppress(DESC_LABEL) + INT + Suppress(
                NAME_LEADIN) + SPACEDWORD + Suppress(DATATYPE_LEADIN) + WORD
        treelen = Suppress(
            SkipTo(TREELENGTH_LEADIN)) + Suppress(TREELENGTH_LEADIN) + FLOAT
        alpha = Suppress(SkipTo(ALPHA_LEADIN)) + Suppress(ALPHA_LEADIN) + FLOAT
        rates = OneOrMore(
            Group(
                Suppress(SkipTo(RATES_LABEL)) + Suppress(RATES_LABEL) +
                OneOrMore(FLOAT)))
        freqs = OneOrMore(
            Group(
                Suppress(SkipTo(FREQS_LABEL)) + Suppress(FREQS_LABEL) +
                OneOrMore(FLOAT)))
        # output of running different set of raxml analysis
        self.TC_STOCHBI_LABEL = Literal(
            'Tree certainty under stochastic bipartition '
            'adjustment for this tree:')
        self.RTC_STOCHBI_LABEL = Literal(
            'Relative tree certainty under stochastic bipartition adjustment for this tree:'
        )
        self.TCA_STOCHBI_LABEL = Literal(
            'Tree certainty including all conflicting bipartitions (TCA) under '
            'stochastic bipartition adjustment for this tree:')
        self.RTCA_STOCHBI_LABEL = Literal(
            'Relative tree certainty including all conflicting bipartitions (TCA) '
            'under stochastic bipartition adjustment for this tree:')
        self.TC_UNIBI_LABEL = Literal(
            'Tree certainty under uniform bipartition '
            'adjustment for this tree:')
        self.RTC_UNIBI_LABEL = Literal('Relative tree certainty under uniform '
                                       'bipartition adjustment for this tree:')
        self.TCA_UNIBI_LABEL = Literal(
            'Tree certainty including all conflicting bipartitions (TCA) under '
            'uniform bipartition adjustment for this tree:')
        self.RTCA_UNIBI_LABEL = Literal(
            'Relative tree certainty including all conflicting bipartitions (TCA) '
            'under uniform bipartition adjustment for this tree:')
        self.tc_stochbi = Suppress(SkipTo(self.TC_STOCHBI_LABEL)) + Suppress(
            self.TC_STOCHBI_LABEL) + FLOAT
        self.rtc_stochbi = Suppress(SkipTo(self.RTC_STOCHBI_LABEL)) + Suppress(
            self.RTC_STOCHBI_LABEL) + FLOAT
        self.tca_stochbi = Suppress(SkipTo(self.TCA_STOCHBI_LABEL)) + Suppress(
            self.TCA_STOCHBI_LABEL) + FLOAT
        self.rtca_stochbi = Suppress(SkipTo(
            self.RTCA_STOCHBI_LABEL)) + Suppress(
                self.RTCA_STOCHBI_LABEL) + FLOAT
        self.tc_unibi = Suppress(SkipTo(self.TC_UNIBI_LABEL)) + Suppress(
            self.TC_UNIBI_LABEL) + FLOAT
        self.rtc_unibi = Suppress(SkipTo(self.RTC_UNIBI_LABEL)) + Suppress(
            self.RTC_UNIBI_LABEL) + FLOAT
        self.tca_unibi = Suppress(SkipTo(self.TCA_UNIBI_LABEL)) + Suppress(
            self.TCA_UNIBI_LABEL) + FLOAT
        self.rtca_unibi = Suppress(SkipTo(self.RTCA_UNIBI_LABEL)) + Suppress(
            self.RTCA_UNIBI_LABEL) + FLOAT
        # Use these for flag 'a' option
        # NOTE(review): self.alpha and self.rates are rebound here, replacing
        # the earlier definitions above -- confirm that is intended.
        self.boot_likelihood = boot_likelihood
        self.freqs = freqs
        self.rates = rates
        self.alpha = alpha
        self.name = description
        self.treelen = treelen
        # Combined parser for RAxML's "-f e" (evaluate) output mode.
        self._dash_f_e_parser = (
            Group(OneOrMore(self.model)) + likelihood + Group(
                OneOrMore(
                    Group(description + alpha + Suppress(TREELENGTH_LEADIN) +
                          Suppress(FLOAT) + Group(OneOrMore(rates)) +
                          Group(OneOrMore(freqs))))))
    def strParsing(self, rcg_string):
        """Build the grammar for one line of a soccer-simulation rcg log and
        parse *rcg_string* with it.

        NOTE(review): several setParseAction callbacks (rcgParsing.*) fire
        side effects during parsing; rcgParsing is defined outside this chunk.
        """
        left_p = Literal("(")
        right_p = Literal(")")
        frame_number = Word(nums)
        teamscore_result_name = Word(alphanums)
        teamscore_result_value = Word(alphanums)
        teamscore_result_score = Word(nums)
        # This needs to be taken care of by AST because some teams have '_' in their names
        teamscore_result = (
            teamscore_result_name + "_" + teamscore_result_value +
            Optional("_" + teamscore_result_score)).setParseAction(
                rcgParsing.get_team_result)
        # Playmode
        # Playmode list
        play_mode_list = (Word(" play_on") ^ Word(" time_over") ^
                          Word(" free_kick_r") ^ Word(" free_kick_l") ^
                          Word(" indirect_free_kick_l") ^
                          Word(" indirect_free_kick_r") ^ Word(" kick_in_l") ^
                          Word(" kick_in_r") ^ Word(" foul_charge_r") ^
                          Word(" foul_charge_l") ^ Word(" kick_off_l") ^
                          Word(" kick_off_r") ^ Word(" corner_kick_l") ^
                          Word(" corner_kick_r") ^ Word(" offside_r") ^
                          Word(" offside_l") ^ Word(" foul_charge_l") ^
                          Word(" foul_charge_r") ^ Word(" goal_kick_l") ^
                          Word(" goal_kick_r") ^ Word(" penalty_setup_l") ^
                          Word(" penalty_setup_r") ^ Word(" penalty_ready_l") ^
                          Word(" penalty_ready_r") ^ Word(" penalty_taken_l") ^
                          Word(" penalty_taken_r") ^ Word(" penalty_miss_l") ^
                          Word(" penalty_miss_r") ^ Word(" penalty_score_r") ^
                          Word(" penalty_score_l"))
        play_mode = (Word("playmode ") + Word(nums) +
                     play_mode_list).setParseAction(
                         rcgParsing.goal_notification)
        # Teamname
        team_name = Combine(
            Word(alphanums) +
            Optional(OneOrMore((Literal("-") | Literal("_")) +
                               Word(alphanums))))
        # Teamscore
        team_score = Word("team ") + Word(
            nums) + team_name + team_name + Word(nums) * 2
        team_score_penalty = Word("team ") + Word(
            nums) + team_name + team_name + Word(nums) * 6
        # Frame and ball information
        show_frame = Word("show ") + frame_number.setParseAction(
            rcgParsing.get_current_frame)
        ball = left_p + left_p + Literal(
            "b") + right_p + Word(nums + "-.") * 4 + right_p
        # Player information
        player_number = left_p + (Word("r") ^ Word("l")) + Word(nums) + right_p
        # Player positions
        player_position = Word(alphanums + "-.")
        # Player view mode - H for high and L for low
        view_mode = left_p + Literal("v") + (
            Word("h") ^ Word("l")) + Word(nums) + right_p
        stamina = left_p + Literal("s") + Word(nums + "-.") * 4 + right_p
        # Outer flag rules
        flag_pos = Word("lrbtc", max=1)
        field_side = Word("lr", max=1)
        distance_from_center = Word(nums)
        outer_flag = flag_pos + ZeroOrMore(field_side) + distance_from_center
        # Inner flag rules
        inner_flag_pos = Word("lrc", max=1)
        inner_flag = inner_flag_pos + (Word("b") ^ Word("t"))
        # Center flag
        center_flag = Literal("c")
        flag = left_p + Literal("f") + (outer_flag ^ inner_flag
                                        ^ center_flag) + right_p
        # Additional information
        additional = left_p + Literal("c") + Word(nums + "-.") * 11 + right_p
        player = left_p + player_number + ZeroOrMore(
            player_position) + view_mode + stamina + ZeroOrMore(
                flag) + additional + right_p
        # Start of game
        start = Word("ULG5")
        server_param = "server_param " + SkipTo(lineEnd)
        player_param = "player_param " + SkipTo(lineEnd)
        player_type = "player_type " + SkipTo(lineEnd)
        # End game - (msg 6000 1 "(result 201806211300 CYRUS2018_0-vs-HELIOS2018_1)")
        end_game = Word("result") + Word(nums) + teamscore_result + Suppress(
            "-vs-") + teamscore_result + Suppress(right_p) + Suppress(
                '"').setParseAction(rcgParsing.game_has_ended)
        team_graphic = (Word("team_graphic_l")
                        ^ Word("team_graphic_r")) + SkipTo(lineEnd)
        msg = "msg" + frame_number + Word(nums) + Suppress('"') + Suppress(
            left_p) + (end_game | team_graphic)
        # Frame lines
        frame_line1 = show_frame + ball + (player * 11)
        frame_line2 = (player * 11)
        # A line is either a header ("ULG5") or a parenthesised record of one
        # of the known kinds.
        read_line = start ^ (left_p +
                             (server_param ^ player_param ^ player_type ^ msg ^
                              ((frame_line1 + frame_line2) ^ play_mode ^
                               team_score ^ team_score_penalty) + right_p))
        return read_line.parseString(rcg_string)
def _generate_report(self, params): """ _generate_report: generate summary report This will contain ALL the logic to generate the report, including areas that should/will be re-factored later """ # Get self.dfu = dfu(self.callback_url) # Get filepath of summary file summary_fp = os.path.join(os.getcwd(), 'outdir', 'genome_by_genome_overview.csv') summary_df = pd.read_csv(summary_fp, header=0, index_col=0) html = summary_df.to_html(index=False, classes='my_class table-striped" id = "my_id') # Need to file write below direct_html = html_template.substitute(html_table=html) # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer start_header = Literal("<thead>") end_header = Literal("</thead>") text = start_header + SkipTo(end_header) new_text = '' for data, start_pos, end_pos in text.scanString(direct_html): new_text = ''.join(data).replace(' style="text-align: right;"', '').replace('thead>', 'tfoot>\n ') + '\n</tfoot>' # Get start and end positions to insert new text end_tbody = Literal("</tbody>") end_table = Literal("</table>") insertion_pos = end_tbody + SkipTo(end_table) final_html = '' for data, start_pos, end_pos in insertion_pos.scanString(direct_html): final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[start_pos + 8:] output_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_dir) result_fp = os.path.join(output_dir, 'index.html') with open(result_fp, 'w') as result_fh: result_fh.write(final_html) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_dir, 'pack': 'zip' })['shock_id'] html_report = [{ 'shock_id': report_shock_id, 'name': os.path.basename(result_fp), 'label': os.path.basename(result_fp), 'description': 'HTML summary report for vConTACT2' }] report_params = {'message': 'Basic message to show in the report', 'workspace_name': params['workspace_name'], 'html_links': html_report, 'direct_html_link_index': 0, 'report_object_name': 
'vConTACT_report_{}'.format(str(uuid.uuid4())), # Don't use until have files to attach to report # 'file_links': [{}], # Don't use until data objects that are created as result of running app # 'objects_created': [{'ref': matrix_obj_ref, # 'description': 'Imported Matrix'}], } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output
#!/usr/bin/env python
"""Grammar pieces for parsing VCD (Value Change Dump) waveform files."""
import sys
from pyparsing import (Word, Group, SkipTo, StringEnd, Suppress, ZeroOrMore,
                       alphas, nums, alphanums, printables, oneOf)

s = Suppress  # shorthand: drop literal keywords from the token stream

# $var <type> <size> <id> <name> $end
identifier = Word(printables)('id')
definition = Word(alphas)('type') + Word(nums)('size') + \
    identifier + Word(printables)('name')
signal = Group(s('$var') + definition + s('$end'))('signal')

# Generic "$<name> ... $end" section; the raw interior is kept as 'content'.
content = SkipTo('$end')('content') + s('$end')
section = Group(s('$') + Word(alphas)('name') + content)('section')

# $timescale 1<unit> $end
unit = s('1') + oneOf('s ms ns us ps fs')
timescale = (s('$timescale') + unit + s('$end'))('timescale')

scope = Group(s('$scope module') + Word(printables)('module') +
              s('$end'))('scope')
upscope = Group(s('$upscope') + s(content))('upscope')
enddefinitions = s('$enddefinitions' + content)

# "#<time>" marks a new simulation timestep.
time = s('#') + Word(nums)('time')

# Scalar value change: a single std_logic state character.
# BUG FIX: the alternatives string previously ended in 'H-', which oneOf
# treats as the single token "H-", so neither 'H' nor '-' (don't-care)
# matched on its own.  Separate them, consistent with std_logic_vector's
# body characters below.
std_logic = oneOf('U X 0 1 Z W L H -')('std_logic')
std_logic_vector = Word('b', 'UX01ZWLH-')('std_logic_vector')
# NOTE(review): fragment -- the stemmer classes (PorterStemmer,
# SnowballStemmer, ISRIStemmer, RSLPStemmer, Stemmer), `lanster`, `wordlist`,
# `file`, and the `try:` matching the trailing `finally:` are all defined
# outside the visible text.  Python 2 syntax (print statements).
porter = PorterStemmer()
snowball = SnowballStemmer("english")
isri = ISRIStemmer()
rslp = RSLPStemmer()
porter2 = Stemmer('english')

endOfString = StringEnd()
# Affix inventories used to crudely segment a word into prefix/root/suffix.
prefix = oneOf(
    "uni inter intro de con com anti pre pro per an ab ad af ac at as re in im ex en em un dis over sub syn out thermo philo geo for fore back"
)
suffix = oneOf("ish")
#suffix = oneOf("or er ed ish ian ary ation tion al ing ible able ate ly ment ism ous ness ent ic ive "
#               "ative tude ence ance ise ant age cide ium ion")
# prefix? + everything-up-to-a-suffix as the root + any trailing suffixes.
word = (Optional(prefix)("prefixes") +
        SkipTo(suffix | suffix + FollowedBy(endOfString)
               | endOfString)("root") +
        ZeroOrMore(suffix | suffix + FollowedBy(endOfString))("suffix"))
#word = (Optional(prefix)("prefixes") + SkipTo(FollowedBy(endOfString))("root"))

# Compare the external stemmers' output on each word in the word list.
for wd in wordlist:
    print wd
    stem = lanster.stem(wd)
    print "LansterStemmer:" + stem
    print "PorterStemmer2:" + porter2.stemWord(wd)
    #res = word.parseString(stem)
    #print res.dump()
    #print
finally:
    file.close()
s_list_append = s_list.append #lookup append func once, instead of many times for tok in toks: if isinstance(tok, basestring): #See if it's a string s_list_append(' ' + tok) else: #If it's not a string s_list_append(normalise_templates(tok)) s_list_append(' >') return ''.join(s_list) #Skip pairs of brackets. angle_bracket_pair = nestedExpr( opener='<', closer='>').setParseAction(turn_parseresults_to_list) #TODO Fix for nesting brackets parentheses_pair = LPAR + SkipTo(RPAR) + RPAR square_bracket_pair = LBRACK + SkipTo(RBRACK) + RBRACK #The raw type of the input, i.e. 'int' in (unsigned const int * foo) #TODO I guess this should be a delimited list (by '::') of name and angle brackets input_type = Combine( Word(alphanums + ':_') + Optional(angle_bracket_pair + Optional(Word(alphanums + ':_')))) #A number. e.g. -1, 3.6 or 5 number = Word('-.' + nums) #The name of the argument. We will ignore this but it must be matched anyway. input_name = OneOrMore( Word(alphanums + '_') | angle_bracket_pair | parentheses_pair | square_bracket_pair)
def get_symptom(self): quitline = Literal("crash> quit") analyze_expression = Combine( Regex(".*KERNEL:") + SkipTo(Suppress(quitline), include=True)) return analyze_expression
# Operator and expression grammar fragments.
# NOTE(review): the helpers (addspace, condense) and terminals (DOT, NAME,
# NUM, PARENS, BRACKETS, BRACES, STRING, START_OF_FILE, SKIP_TO_TEXT,
# DOTTED_NAME, NO_BS_NL, NL) are defined outside this chunk.
# Augmented/plain assignment operator, e.g. "+=", "&=", "=" -- but not "==".
ASSIGN_OP = Combine((Word("~%^&*-+|/") | ~Literal("==")) + Literal("="))
UNARY_OP = addspace(OneOrMore(Word("~-+") | Keyword("not")))
# Any binary operator that is not an assignment.
BINARY_OP = ~ASSIGN_OP + (
    Word("!%^&*-+=|/<>") | Keyword("and") | Keyword("or")
    | addspace(OneOrMore(Keyword("is") | Keyword("not") | Keyword("in"))))
OP = ASSIGN_OP | UNARY_OP | BINARY_OP
# Postfix trailers: attribute access, call, or subscript.
TRAILER = DOT + NAME | PARENS | BRACKETS
TRAILERS = condense(ZeroOrMore(TRAILER))
ATOM_BASE = NAME | NUM | PARENS | BRACKETS | BRACES | STRING
ATOM = condense(ATOM_BASE + TRAILERS)
UNARY_OP_ATOM = addspace(Optional(UNARY_OP) + ATOM)
EXPR = addspace(UNARY_OP_ATOM + ZeroOrMore(BINARY_OP + UNARY_OP_ATOM))
# The file header: everything at the top of the file -- docstrings, comments
# and import statements -- captured as raw text.
HEADER = originalTextFor(
    START_OF_FILE + ZeroOrMore(SKIP_TO_TEXT +
                               (STRING | pythonStyleComment
                                | Optional(Keyword("from") + DOTTED_NAME) +
                                Keyword("import") + SkipTo(NO_BS_NL)) + NL))
    def __init__(self):
        """Build the grammar and configure it to skip comments.

        NOTE(review): T_COM (the comment marker) and lineEnd come from outside
        this chunk; __init_parser is a private method of this class.
        """
        self.parser = self.__init_parser()
        # Ignore everything from a comment marker to the end of the line.
        self.parser.ignore(T_COM + SkipTo(lineEnd))
# NOTE(review): this chunk begins mid-expression -- the statement completed by
# the first two lines starts before the visible text.  KW_PARAM, symbol,
# subscript_domain, param_default, END, the *Stmt parse-action classes and the
# other statement parsers are defined outside this chunk.
    + END
)
param_tabbing_stmt.setParseAction(ParamTabbingStmt)

# param <name> [subscript domain] <default> ;
param_def_stmt = (
    KW_PARAM + symbol.setResultsName("name") + Optional(subscript_domain) +
    param_default + END
)
param_def_stmt.setParseAction(ParamDefStmt)

# A data file is any number of set/param statements.
stmts = set_stmt | set_def_stmt | param_def_stmt | param_stmt | param_tabbing_stmt
grammar = ZeroOrMore(stmts) + StringEnd()
# Comments and a trailing "end;" line are ignored wherever they appear.
grammar.ignore("#" + SkipTo(lineEnd))
grammar.ignore("end;" + SkipTo(lineEnd))


class Amply(object):
    """ Data parsing interface """

    def __init__(self, s=""):
        """
        Create an Amply parser instance

        @param s (default ""): initial string to parse
        """
# Only used as the top of the appendix hierarchy a1 = Word(string.digits).setResultsName("a1") aI = Word("IVXLCDM").setResultsName("aI") # Catches the A in 12A but not in 12Awesome markerless_upper = Word(string.ascii_uppercase).setResultsName( 'markerless_upper') + ~FollowedBy(Word(string.ascii_lowercase)) paren_upper = parenthesize(string.ascii_uppercase, "paren_upper") paren_lower = parenthesize(string.ascii_lowercase, "paren_lower") paren_digit = parenthesize(string.digits, "paren_digit") period_upper = decimalize(string.ascii_uppercase, "period_upper") period_lower = decimalize(string.ascii_lowercase, "period_lower") period_digit = decimalize(string.digits, "period_digit") section = (atomic.section_marker.copy().leaveWhitespace() + unified.part_section + SkipTo(LineEnd())) par = (atomic.section.copy().leaveWhitespace() + unified.depth1_p + SkipTo(LineEnd())) marker_par = (atomic.paragraph_marker.copy().leaveWhitespace() + atomic.section + unified.depth1_p) appendix = (atomic.appendix_marker.copy().leaveWhitespace() + atomic.appendix + SkipTo(LineEnd())) headers = utils.QuickSearchable(LineStart() + (section | marker_par | par | appendix))
# NOTE(review): fragment -- the function wrapping the first statement
# (presumably set_content), the shared state dict `s`, set_label, and the
# type_head/type_subhead/type_title constants are defined outside the visible
# text; the trailing `for` loop's body is also outside it.
    s["content"] = toks[0]


def set_head(string, loc, toks):
    # Mark the current line as a numbered headline.
    s["type"] = type_head


def set_subhead(string, loc, toks):
    # Mark the current line as a '+' subhead.
    s["type"] = type_subhead


def set_title(string, loc, toks):
    # Mark the current line as a '*' title.
    s["type"] = type_title


# Line grammar: "[label]" tags, then "+ text", "1. text" or "* text".
Label = (Literal("[") + SkipTo(']') + "]").setParseAction(set_label)
Sentence = (Word(alphanums + " ")).setParseAction(set_content)
Subhead = (Literal("+") + ZeroOrMore(Label) +
           Sentence).setParseAction(set_subhead)
Headline = (Word(nums) + Literal(".") + ZeroOrMore(Label) +
            Sentence).setParseAction(set_head)
Title = (Literal("*") + Sentence).setParseAction(set_title)
Line = StringStart() + (Subhead | Headline | Title)

# Read a todo list, generate a schedule for the scheduler to optimize.
curr_headline = ""
items = []
for line in open(sys.argv[1], "r"):
# NOTE(review): fragment -- `sexp` and `t` are defined outside the visible
# text.
print(sexp.parseString(t))
print()

with open('../tests/tex_files/reinhardt/reinhardt-optimal-control.tex',
          'r') as rein_file:
    rein = rein_file.read()

#with open('../tests/tex_files/short_xymatrix_example.tex') as xymatrix_file:
#    short_example = xymatrix_file.read()

#with open('../../stacks-tests/orig/perfect.tex') as xymatrix_file:
#    stacks_example = xymatrix_file.read()

# +
# Grammar to pull the three brace-delimited arguments of \tikzfig{..}{..}{..}.
cstikzfig = oneOf(["\\tikzfig", "\\mathcal"]).suppress()
lbrace = Literal('{').suppress()
rbrace = Literal('}').suppress()
parens = Word("()%\\")
inside = SkipTo(rbrace)
allchars = Word(printables, excludeChars="{}")
# NOTE(review): this rebinds `inside`, discarding the SkipTo version above.
inside = ZeroOrMore(allchars)
inside.setParseAction(lambda tok: " ".join(tok))

# Recursive brace-balanced content, flattened to a space-joined string.
content = Forward()
content << OneOrMore(allchars | (lbrace + ZeroOrMore(content) + rbrace))
#content << (allchars + lbrace + ZeroOrMore(content) + rbrace)
content.setParseAction(lambda tok: " ".join(tok))

tikzfig = cstikzfig + lbrace + inside + rbrace + lbrace + inside + rbrace + \
    lbrace + content + rbrace

csxymatrix = oneOf(["\\xymatrix", "\\mathcal"]).suppress()
xymatrix = csxymatrix + lbrace + content + rbrace

search_res = tikzfig.searchString(rein)
# NOTE(review): `short_example` is only bound in the commented-out block
# above, so this line raises NameError as written -- confirm intent.
search_res = xymatrix.searchString(short_example)
    def parse(cls, content, basedir=None, resolve=True,
              unresolved_value=DEFAULT_SUBSTITUTION):
        """parse a HOCON content

        :param content: HOCON content to parse
        :type content: basestring
        :param basedir: directory used to resolve relative include files
        :param resolve: if true, resolve substitutions
        :type resolve: boolean
        :param unresolved_value: assigned value value to unresolved substitution.
        If overriden with a default value, it will replace all unresolved value to the default value.
        If it is set to to pyhocon.STR_SUBSTITUTION then it will replace the value by its substitution expression (e.g., ${x})
        :type unresolved_value: boolean
        :return: a ConfigTree or a list
        """
        unescape_pattern = re.compile(r'\\.')

        def replace_escape_sequence(match):
            # Map recognised escape sequences; leave unknown ones untouched.
            value = match.group(0)
            return cls.REPLACEMENTS.get(value, value)

        def norm_string(value):
            return unescape_pattern.sub(replace_escape_sequence, value)

        def unescape_string(tokens):
            return ConfigUnquotedString(norm_string(tokens[0]))

        def parse_multi_string(tokens):
            # remove the first and last 3 "
            return tokens[0][3:-3]

        def convert_number(tokens):
            # Prefer int; fall back to float for decimals/exponents.
            n = tokens[0]
            try:
                return int(n, 10)
            except ValueError:
                return float(n)

        # ${path} or ${?path} for optional substitution
        SUBSTITUTION_PATTERN = r"\$\{(?P<optional>\?)?(?P<variable>[^}]+)\}(?P<ws>[ \t]*)"

        def create_substitution(instring, loc, token):
            # remove the ${ and }
            match = re.match(SUBSTITUTION_PATTERN, token[0])
            variable = match.group('variable')
            ws = match.group('ws')
            optional = match.group('optional') == '?'
            substitution = ConfigSubstitution(variable, optional, ws, instring,
                                              loc)
            return substitution

        # quoted string, keeping any trailing inline whitespace
        STRING_PATTERN = '"(?P<value>(?:[^"\\\\]|\\\\.)*)"(?P<ws>[ \t]*)'

        def create_quoted_string(instring, loc, token):
            # unquote and unescape the matched string
            match = re.match(STRING_PATTERN, token[0])
            value = norm_string(match.group('value'))
            ws = match.group('ws')
            return ConfigQuotedString(value, ws, instring, loc)

        def include_config(instring, loc, token):
            # Handle: include "x", include url("x"), include file("x"),
            # each optionally wrapped in required(...).
            url = None
            file = None
            required = False
            if token[0] == 'required':
                required = True
                final_tokens = token[1:]
            else:
                final_tokens = token
            if len(final_tokens) == 1:  # include "test"
                value = final_tokens[0].value if isinstance(
                    final_tokens[0], ConfigQuotedString) else final_tokens[0]
                if value.startswith("http://") or value.startswith(
                        "https://") or value.startswith("file://"):
                    url = value
                else:
                    file = value
            elif len(final_tokens) == 2:  # include url("test") or file("test")
                value = final_tokens[1].value if isinstance(
                    token[1], ConfigQuotedString) else final_tokens[1]
                if final_tokens[0] == 'url':
                    url = value
                else:
                    file = value
            if url is not None:
                logger.debug('Loading config from url %s', url)
                obj = ConfigFactory.parse_URL(
                    url,
                    resolve=False,
                    required=required,
                    unresolved_value=NO_SUBSTITUTION)
            elif file is not None:
                path = file if basedir is None else os.path.join(basedir, file)
                logger.debug('Loading config from file %s', path)
                obj = ConfigFactory.parse_file(
                    path,
                    resolve=False,
                    required=required,
                    unresolved_value=NO_SUBSTITUTION)
            else:
                raise ConfigException(
                    'No file or URL specified at: {loc}: {instring}',
                    loc=loc,
                    instring=instring)
            return ConfigInclude(obj if isinstance(obj, list) else obj.items())

        @contextlib.contextmanager
        def set_default_white_spaces():
            # HOCON treats newlines as significant, so restrict pyparsing's
            # default skipped whitespace to spaces/tabs while the grammar is
            # built, then restore the previous global setting.
            default = ParserElement.DEFAULT_WHITE_CHARS
            ParserElement.setDefaultWhitespaceChars(' \t')
            yield
            ParserElement.setDefaultWhitespaceChars(default)

        with set_default_white_spaces():
            assign_expr = Forward()
            true_expr = Keyword("true", caseless=True).setParseAction(
                replaceWith(True))
            false_expr = Keyword("false", caseless=True).setParseAction(
                replaceWith(False))
            null_expr = Keyword("null", caseless=True).setParseAction(
                replaceWith(NoneValue()))
            key = QuotedString(
                '"', escChar='\\',
                unquoteResults=False) | Word(alphanums + alphas8bit + '._- /')

            eol = Word('\n\r').suppress()
            eol_comma = Word('\n\r,').suppress()
            comment = (Literal('#') | Literal('//')) - SkipTo(eol | StringEnd())
            comment_eol = Suppress(Optional(eol_comma) + comment)
            comment_no_comma_eol = (comment | eol).suppress()
            # number only if followed by end-of-value (lookahead), else it is
            # part of an unquoted string
            number_expr = Regex(
                r'[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE][+\-]?\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))',
                re.DOTALL).setParseAction(convert_number)

            # multi line string using """
            # Using fix described in http://pyparsing.wikispaces.com/share/view/3778969
            multiline_string = Regex(
                '""".*?"*"""',
                re.DOTALL | re.UNICODE).setParseAction(parse_multi_string)
            # single quoted line string
            quoted_string = Regex(
                r'"(?:[^"\\\n]|\\.)*"[ \t]*',
                re.UNICODE).setParseAction(create_quoted_string)
            # unquoted string that takes the rest of the line until an optional comment
            # we support .properties multiline support which is like this:
            # line1 \
            # line2 \
            # so a backslash precedes the \n
            unquoted_string = Regex(
                r'(?:[^^`+?!@*&"\[\{\s\]\}#,=\$\\]|\\.)+[ \t]*',
                re.UNICODE).setParseAction(unescape_string)
            substitution_expr = Regex(
                r'[ \t]*\$\{[^\}]+\}[ \t]*').setParseAction(create_substitution)
            string_expr = multiline_string | quoted_string | unquoted_string

            value_expr = number_expr | true_expr | false_expr | null_expr | string_expr

            include_content = (quoted_string | (
                (Keyword('url') | Keyword('file')) - Literal('(').suppress() -
                quoted_string - Literal(')').suppress()))
            include_expr = (Keyword("include", caseless=True).suppress() +
                            (include_content |
                             (Keyword("required") - Literal('(').suppress() -
                              include_content - Literal(')').suppress()))
                            ).setParseAction(include_config)

            root_dict_expr = Forward()
            dict_expr = Forward()
            list_expr = Forward()
            multi_value_expr = ZeroOrMore(comment_eol | include_expr |
                                          substitution_expr | dict_expr |
                                          list_expr | value_expr |
                                          (Literal('\\') - eol).suppress())
            # for a dictionary : or = is optional
            # last zeroOrMore is because we can have t = {a:4} {b: 6} {c: 7} which is dictionary concatenation
            inside_dict_expr = ConfigTreeParser(
                ZeroOrMore(comment_eol | include_expr | assign_expr |
                           eol_comma))
            inside_root_dict_expr = ConfigTreeParser(
                ZeroOrMore(comment_eol | include_expr | assign_expr |
                           eol_comma),
                root=True)
            dict_expr << Suppress('{') - inside_dict_expr - Suppress('}')
            root_dict_expr << Suppress('{') - inside_root_dict_expr - Suppress(
                '}')
            list_entry = ConcatenatedValueParser(multi_value_expr)
            list_expr << Suppress('[') - ListParser(list_entry - ZeroOrMore(
                eol_comma - list_entry)) - Suppress(']')

            # special case when we have a value assignment where the string can potentially be the remainder of the line
            assign_expr << Group(key - ZeroOrMore(comment_no_comma_eol) - (
                dict_expr | (Literal('=') | Literal(':') | Literal('+=')) -
                ZeroOrMore(comment_no_comma_eol) -
                ConcatenatedValueParser(multi_value_expr)))

            # the file can be { ... } where {} can be omitted or []
            config_expr = ZeroOrMore(comment_eol | eol) + (
                list_expr | root_dict_expr |
                inside_root_dict_expr) + ZeroOrMore(comment_eol | eol_comma)
            config = config_expr.parseString(content, parseAll=True)[0]

        if resolve:
            allow_unresolved = resolve and unresolved_value is not DEFAULT_SUBSTITUTION and unresolved_value is not MANDATORY_SUBSTITUTION
            has_unresolved = cls.resolve_substitutions(config,
                                                       allow_unresolved)
            if has_unresolved and unresolved_value is MANDATORY_SUBSTITUTION:
                raise ConfigSubstitutionException(
                    'resolve cannot be set to True and unresolved_value to MANDATORY_SUBSTITUTION'
                )

        if unresolved_value is not NO_SUBSTITUTION and unresolved_value is not DEFAULT_SUBSTITUTION:
            cls.unresolve_substitutions_to_value(config, unresolved_value)
        return config
# URL extractor # Copyright 2004, Paul McGuire from pyparsing import Literal,Suppress,CharsNotIn,CaselessLiteral,\ Word,dblQuotedString,alphanums,SkipTo,makeHTMLTags import urllib import pprint # Define the pyparsing grammar for a URL, that is: # URLlink ::= <a href= URL>linkText</a> # URL ::= doubleQuotedString | alphanumericWordPath # Note that whitespace may appear just about anywhere in the link. Note also # that it is not necessary to explicitly show this in the pyparsing grammar; by default, # pyparsing skips over whitespace between tokens. linkOpenTag, linkCloseTag = makeHTMLTags("a") link = linkOpenTag + SkipTo(linkCloseTag).setResultsName( "body") + linkCloseTag.suppress() # Go get some HTML with some links in it. serverListPage = urllib.urlopen("http://www.google.com") htmlText = serverListPage.read() serverListPage.close() # scanString is a generator that loops through the input htmlText, and for each # match yields the tokens and start and end locations (for this application, we are # not interested in the start and end values). for toks, strt, end in link.scanString(htmlText): print toks.startA.href, "->", toks.body # Create dictionary from list comprehension, assembled from each pair of tokens returned # from a matched URL. pprint.pprint(
def parse(content, basedir=None, resolve=True):
    """parse a HOCON content

    :param content: HOCON content to parse
    :type content: basestring
    :param basedir: directory used to resolve relative include files
    :param resolve: If true, resolve substitutions
    :type resolve: boolean
    :return: a ConfigTree or a list
    """

    def norm_string(value):
        # Apply every escape-sequence replacement known to the parser.
        for k, v in ConfigParser.REPLACEMENTS.items():
            value = value.replace(k, v)
        return value

    def unescape_string(tokens):
        return ConfigUnquotedString(norm_string(tokens[0]))

    def parse_multi_string(tokens):
        # remove the first and last 3 "
        return tokens[0][3:-3]

    def convert_number(tokens):
        # Prefer int; fall back to float for decimals/exponents.
        n = tokens[0]
        try:
            return int(n)
        except ValueError:
            return float(n)

    # ${path} or ${?path} for optional substitution
    SUBSTITUTION = "\$\{(?P<optional>\?)?(?P<variable>[^}]+)\}(?P<ws>\s*)"

    def create_substitution(instring, loc, token):
        # remove the ${ and }
        match = re.match(SUBSTITUTION, token[0])
        variable = match.group('variable')
        ws = match.group('ws')
        optional = match.group('optional') == '?'
        substitution = ConfigSubstitution(variable, optional, ws, instring,
                                          loc)
        return substitution

    def include_config(token):
        # Handle: include "x", include url("x"), include file("x").
        url = None
        file = None
        if len(token) == 1:  # include "test"
            if token[0].startswith("http://") or token[0].startswith(
                    "https://") or token[0].startswith("file://"):
                url = token[0]
            else:
                file = token[0]
        elif len(token) == 2:  # include url("test") or file("test")
            if token[0] == 'url':
                url = token[1]
            else:
                file = token[1]
        if url is not None:
            logger.debug('Loading config from url %s', url)
            obj = ConfigFactory.parse_URL(url, resolve=False)
        if file is not None:
            path = file if basedir is None else os.path.join(basedir, file)
            logger.debug('Loading config from file %s', path)
            obj = ConfigFactory.parse_file(path, required=False, resolve=False)
        return ConfigInclude(obj if isinstance(obj, list) else obj.items())

    # HOCON treats newlines as significant, so only spaces/tabs are skipped.
    # NOTE(review): this mutates pyparsing's global default and is never
    # restored here.
    ParserElement.setDefaultWhitespaceChars(' \t')

    assign_expr = Forward()
    true_expr = Keyword("true", caseless=True).setParseAction(
        replaceWith(True))
    false_expr = Keyword("false", caseless=True).setParseAction(
        replaceWith(False))
    null_expr = Keyword("null", caseless=True).setParseAction(
        replaceWith(NoneValue()))
    key = QuotedString(
        '"', escChar='\\', unquoteResults=False) | Word(alphanums + '._- ')

    eol = Word('\n\r').suppress()
    eol_comma = Word('\n\r,').suppress()
    comment = (Literal('#') | Literal('//')) - SkipTo(eol)
    comment_eol = Suppress(Optional(eol_comma) + comment)
    comment_no_comma_eol = (comment | eol).suppress()
    # number only when followed by end-of-value (lookahead); otherwise it is
    # part of an unquoted string
    number_expr = Regex(
        '[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE]\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))',
        re.DOTALL).setParseAction(convert_number)

    # multi line string using """
    # Using fix described in http://pyparsing.wikispaces.com/share/view/3778969
    multiline_string = Regex(
        '""".*?"""', re.DOTALL | re.UNICODE).setParseAction(parse_multi_string)
    # single quoted line string
    quoted_string = QuotedString(quoteChar='"', escChar='\\', multiline=True)
    # unquoted string that takes the rest of the line until an optional comment
    # we support .properties multiline support which is like this:
    # line1 \
    # line2 \
    # so a backslash precedes the \n
    unquoted_string = Regex(
        r'(\\[ \t]*[\r\n]|[^\[\{\n\r\]\}#,=\$])+?(?=($|\$|[ \t]*(//|[\}\],#\n\r])))',
        re.DOTALL).setParseAction(unescape_string)
    substitution_expr = Regex('[ \t]*\$\{[^\}]+\}[ \t]*').setParseAction(
        create_substitution)
    string_expr = multiline_string | quoted_string | unquoted_string

    value_expr = number_expr | true_expr | false_expr | null_expr | string_expr

    include_expr = (Keyword("include", caseless=True).suppress() - (
        quoted_string | (
            (Keyword('url') | Keyword('file')) - Literal('(').suppress() -
            quoted_string - Literal(')').suppress()))) \
        .setParseAction(include_config)

    dict_expr = Forward()
    list_expr = Forward()
    multi_value_expr = ZeroOrMore((Literal('\\') - eol).suppress() |
                                  comment_eol | include_expr |
                                  substitution_expr | dict_expr | list_expr |
                                  value_expr)
    # for a dictionary : or = is optional
    # last zeroOrMore is because we can have t = {a:4} {b: 6} {c: 7} which is dictionary concatenation
    inside_dict_expr = ConfigTreeParser(
        ZeroOrMore(comment_eol | include_expr | assign_expr | eol_comma))
    dict_expr << Suppress('{') - inside_dict_expr - Suppress('}')
    list_entry = ConcatenatedValueParser(multi_value_expr)
    list_expr << Suppress('[') - ListParser(
        list_entry - ZeroOrMore(eol_comma - list_entry)) - Suppress(']')

    # special case when we have a value assignment where the string can potentially be the remainder of the line
    assign_expr << Group(key - ZeroOrMore(comment_no_comma_eol) -
                         (dict_expr | Suppress(Literal('=') | Literal(':')) -
                          ZeroOrMore(comment_no_comma_eol) -
                          ConcatenatedValueParser(multi_value_expr)))

    # the file can be { ... } where {} can be omitted or []
    config_expr = ZeroOrMore(comment_eol | eol) + (
        list_expr | dict_expr | inside_dict_expr) + ZeroOrMore(comment_eol |
                                                               eol_comma)
    config = config_expr.parseString(content, parseAll=True)[0]
    if resolve:
        ConfigParser.resolve_substitutions(config)
    return config
def make_parser():
    """Build and return the pyparsing grammar for a plumbing rules file.

    A rules file is a sequence of named rules; each rule has a heading in
    square brackets, one 'kind is ...' clause, zero or more match clauses,
    and one or more action clauses.  Returns the top-level ParserElement
    (`RulesFile`) with comment handling already attached.
    """
    # Only spaces/tabs are skippable whitespace; newlines are significant tokens.
    ParserElement.setDefaultWhitespaceChars(' \t')
    EOL = OneOrMore(LineEnd()).suppress().setName("end of line")
    Spaces = OneOrMore(" ").suppress()
    # NOTE: These are not all 'printable' Unicode characters.
    # If needed, expand the alphas_extra variable.
    alphas_extra = ''.join(chr(x) for x in range(0x100, 0x350))
    chars = printables + alphas8bit + alphas_extra
    Token = Word(chars)
    # '-' (And with error stop) makes a missing comment body a hard parse error.
    InlineComment = '#' - SkipTo(EOL)
    WholelineComment = LineStart() + '#' - restOfLine - EOL
    Argument = Token('arg').setName('argument')
    Variable = Token('var').setName('variable')
    KindObject = Keyword('kind')('object')
    KindVerb = Keyword('is')('verb')
    # NOTE(review): `Named` is a project-local wrapper (not pyparsing) —
    # presumably it tags/renames the wrapped expression; confirm in its definition.
    Kind = Named(Keyword('url') | Keyword('raw') | Keyword('text'))('arg')
    MatchObject = Named(Keyword('arg'))('object')
    data = Named(Keyword('data'))('object')
    MatchVerb = Named(
        Keyword('is') | Keyword('istype') | Keyword('matches')
        | Keyword('rewrite'))('verb').setName('verb')
    # A pattern is one or more indented argument lines; leaveWhitespace keeps
    # the leading spaces visible so indentation can be detected.
    Pattern = Named(Group(OneOrMore(Spaces + Argument +
                                    EOL)))('arg').leaveWhitespace()
    ActionObject = Keyword('plumb')('object')
    ActionVerb = Named(
        Keyword('run') | Keyword('notify') | Keyword('download'))('verb')
    Action = Named(originalTextFor(OneOrMore(Argument)))('arg')
    ArgMatchClause = Group(MatchObject - MatchVerb - Variable - Pattern)
    DataMatchClause = Group(data - MatchVerb - Pattern)

    # Transform every 'data match' rule to an equivalent 'arg match' rule
    def data_to_arg(toks):
        assert (len(toks) == 1)
        # Rewrite object 'data' -> 'arg' and inject the implicit '{data}' variable.
        toks[0][0] = 'arg'
        toks[0].insert(2, '{data}')
        return toks

    DataMatchClause.setParseAction(data_to_arg)

    KindClause = Group(KindObject - KindVerb - Kind) - EOL
    # DataMatchClause must be tried first: 'data' would otherwise never match
    # after ArgMatchClause consumes input.
    MatchClause = (DataMatchClause | ArgMatchClause)
    ActionClause = Group(ActionObject - ActionVerb - Action) - EOL
    MatchBlock = Group(ZeroOrMore(MatchClause('match-clause')))
    ActionBlock = Group(OneOrMore(ActionClause('action-clause')))
    # TODO: allow the excluded chars if they are escaped.
    RuleName = Word(chars, excludeChars='{ } [ ]')('rule-name')
    RuleHeading = Suppress('[') - RuleName - Suppress(']') - EOL
    Rule = Group(RuleHeading - KindClause('kind-clause') -
                 MatchBlock('match-block') - ActionBlock('action-block'))
    RulesFile = OneOrMore(Rule)
    RulesFile.ignore(WholelineComment)
    RulesFile.ignore(InlineComment)

    # Friendly names so pyparsing error messages point at grammar concepts.
    for v in [MatchObject, ActionObject]:
        v.setName('object')
    for v in [MatchVerb, ActionVerb]:
        v.setName('verb')
    Kind.setName('kind')
    data.setName('object')
    Pattern.setName('pattern')
    Action.setName('action or url')
    KindClause.setName('kind clause')
    MatchClause.setName('match clause')
    ActionClause.setName('action clause')
    MatchBlock.setName('match block')
    ActionBlock.setName('action block')
    Rule.setName('rule')
    RuleName.setName('rule name')
    RulesFile.setName('rules')
    return RulesFile
ws = ' \t' ParserElement.setDefaultWhitespaceChars(ws) EOL = LineEnd().suppress() SOL = LineStart().leaveWhitespace() blankline = SOL + LineEnd() noIndentation = SOL + ~Word(ws).leaveWhitespace().suppress() indentation = SOL + Word(ws).leaveWhitespace().suppress() date = Combine( Word(nums, exact=4) + '-' + Word(nums, exact=2) + '-' + Word(nums, exact=2)) description = SkipTo(';' | EOL) accountName = SkipTo(Literal(' ') | Literal(';') | Literal('\n')) currency = Word(alphas + '£$') number = Word(nums + '-.,') amount = currency('currency') + number('value') postingLine = (indentation + accountName('account') + Optional(amount)('amount') + restOfLine + EOL) postings = OneOrMore(Group(postingLine)) transaction = (date('date') + description('description') + EOL + Group(postings)('postings')) # # Single statements # keyword = Word(alphanums) # singleValue = restOfLine
def skipSupress(z: str) -> ParserElement: """Skip until `z` and suppress the skipped values.""" return Suppress(SkipTo(z))
# URL extractor # Copyright 2004, Paul McGuire from pyparsing import makeHTMLTags, SkipTo, pyparsing_common import urllib.request from contextlib import closing import pprint linkOpenTag, linkCloseTag = makeHTMLTags('a') linkBody = SkipTo(linkCloseTag) linkBody.setParseAction(pyparsing_common.stripHTMLTags) linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split())) link = linkOpenTag + linkBody("body") + linkCloseTag.suppress() # Go get some HTML with some links in it. with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage: htmlText = serverListPage.read().decode("UTF-8") # scanString is a generator that loops through the input htmlText, and for each # match yields the tokens and start and end locations (for this application, we are # not interested in the start and end values). for toks,strt,end in link.scanString(htmlText): print(toks.asList()) # Create dictionary from list comprehension, assembled from each pair of tokens returned # from a matched URL. pprint.pprint( dict((toks.body, toks.href) for toks,strt,end in link.scanString(htmlText)) )
def parse_section(start: str, end: str) -> ParserElement: """Read the lines from `start` to `end`.""" s = Literal('{}'.format(start)) e = Literal('{}'.format(end)) return Suppress(SkipTo(s)) + skipLine + SkipTo(e)
print(len(input_text.split("\n"))) for line in input_text.split("\n"): # print("Word " + str(i) + " --- " + line) line_p = line.replace("\xa0", " ") # line_p = unicodedata.normalize("NFC", line) word_text.append(line_p) word_bold = (Literal("**").suppress() + Concat( OneOrMore( Word(alphas) ^ Cleanup( Literal("(").suppress() + Word(alphas) + Literal(")").suppress()))) + Literal("**").suppress()) word_def = ( LineStart() + Optional(Word(nums + " /")).suppress() + Concat(SkipTo(Word("►¶"))).setResultsName("definition") + OneOrMore( Literal("►").suppress() + NotAny(Literal("►")).suppress() + Concat(SkipTo(oneOf(genders) ^ Word("|¶►") ^ LineEnd())).setResultsName("words") + Concat( Optional(OneOrMore( oneOf(genders) + Optional(Literal(" ")).suppress()), default="na").setResultsName("gender")) + Optional( ( SkipTo(Literal("¶")).suppress() + Literal("¶").suppress() + Concat(SkipTo(Literal("►") ^ LineEnd())) # SkipTo(Word("►¶")).suppress() ).setResultsName("sources"), default="na")) + Optional( (SkipTo(Literal("►►")).suppress() + Literal("►►").suppress() +
from pyparsing import Literal, Word, Optional, Combine, delimitedList, printables, alphas, commaSeparatedList, SkipTo expr = SkipTo("in") + commaSeparatedList reference = "Ted has a beard and moustache in the flashback to him meeting Barney for the first time. He is shown with a goatee in the flashback to 2002 in Double Date, and with similar facial hair in the flashback to Barney's days as Insane Duane's best friend in Symphony of Illumination." print expr.parseString( reference )