class Expando(List):
    """Bracketed numeric range such as ``[1-5]`` or ``[1:5]``.

    The two bounds are captured into the ``begin`` and ``end`` attributes;
    either ``-`` or ``:`` is accepted as the separator.
    """
    grammar = (
        '[',
        attr('begin', re.compile(r'\d+')),
        ['-', ':'],
        attr('end', re.compile(r'\d+')),
        ']',
    )

    def _build(self, rr):
        # Hand every parsed element over to the collector's expando list.
        rr._expandos.extend(self)
        return
class LessEqualOp(UnaryRule):
    """Less than or Equal to operator.

    Supports queries like date <= 10-2000 or author-count 100-.
    """
    grammar = [
        # Explicit form: "<=" operator followed by a simple value.
        (omit(Literal("<=")), attr('op', SimpleValue)),

        # Accept a number or numbers that are separated with (/ or -) followed by a "-" which should be
        # followed by \s or ) or end of input so that you don't accept a value like 1-e.
        (attr('op', re.compile(r"\d+([/-]\d+)*(?=-)")), omit(re.compile(r'-(?=\s|\)|$)'))),
    ]
class LessEqualOp(UnaryRule):
    """Less than or Equal to operator.

    Supports queries like date <= 10-2000 or author-count 100-.
    """
    grammar = [
        # Explicit form: "<=" operator followed by a simple value.
        (omit(Literal("<=")), attr('op', SimpleValue)),

        # Accept a number or anything that doesn't contain {whitespace, (, ), :} followed by a "-" which should be
        # followed by \s or ) or end of input so that you don't accept a value that is 1-e.
        (attr('op', re.compile(r"\d+")), omit(re.compile(r'-(?=\s|\)|$)'))),
        # NOTE(review): the comment above describes a trailing "-", but the omit pattern below matches a
        # trailing "+" (r'\+...') — confirm against the intended grammar.
        (attr('op', re.compile(r"[^\s():]+(?=( -|-))")), omit(re.compile(r'\+(?=\s|\)|$)'))),
    ]
class And(CIKeyword):
    """
    The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking
    whether terminal symbols are actually DSL keywords.
    """
    # Case-insensitive alternatives for the AND operator: "and", "+", "&".
    regex = re.compile(r"(and|\+|&)", re.IGNORECASE)
    grammar = Enum(K("and"), K("+"), K("&"))
class Not(CIKeyword):
    """
    The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking
    whether terminal symbols are actually DSL keywords.
    """
    # Case-insensitive alternatives for the NOT operator: "not", "-".
    regex = re.compile(r"(not|-)", re.IGNORECASE)
    grammar = Enum(K("not"), K("-"))
class Operator(Keyword):
    """Boolean operator appearing in a transaction output condition."""

    grammar = Enum(K("&&"), K("||"), K("AND"), K("OR"))
    regex = re.compile(r"[&&|\|\||\w]+")

    @classmethod
    def token(cls: Type[OperatorType], keyword: str) -> OperatorType:
        """
        Return Operator instance from keyword

        :param keyword: Operator keyword in expression
        :return:
        """
        return cls(keyword)

    def compose(
        self, parser: Any = None, grammar: Any = None, attr_of: str = None
    ) -> str:
        """
        Return the Operator keyword as string format

        :param parser: Parser instance
        :param grammar: Grammar
        :param attr_of: Attribute of...
        """
        return "{0}".format(self.name)
class GreaterThanOp(UnaryRule):
    """Greater than operator.

    Supports queries like author-count > 2000 or date after 10-2000.
    """
    # "after" (SPIRES style) and ">" (Invenio style) are equivalent; only the value is kept.
    grammar = omit(re.compile(r"after|>", re.IGNORECASE)), attr('op', SimpleValue)
class LessThanOp(UnaryRule):
    """Less than operator.

    Supports queries like author-count < 100 or date before 1984.
    """
    # "before" (SPIRES style) and "<" (Invenio style) are equivalent; only the value is kept.
    grammar = omit(re.compile(r"before|<", re.IGNORECASE)), attr('op', SimpleValue)
class Or(CIKeyword):
    """
    The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking
    whether terminal symbols are actually DSL keywords.
    """
    # Case-insensitive alternatives for the OR operator: "or", "|".
    regex = re.compile(r"(or|\|)", re.IGNORECASE)
    grammar = Enum(K("or"), K("|"))

    def __init__(self, *args):
        # Normalize different OR keywords (ignore the keyword argument that was passed).
        super(Or, self).__init__(BooleanOperator.OR)
class Query(ListRule):
    """The entry-point for the grammar.

    Find keyword is ignored as the current grammar is an augmentation of SPIRES and Invenio style syntaxes.
    It only serves for backward compatibility with SPIRES syntax.
    """
    grammar = [
        # Optional leading "find"/"fin"/"fi"/"f" (SPIRES), then a statement plus any trailing
        # unrecognized words.
        (omit(optional(re.compile(r"(find|fin|fi|f)\s", re.IGNORECASE))),
         (Statement, maybe_some(MalformedQueryWords))),
        # Fallbacks: a query made only of unrecognized words, or an empty query.
        MalformedQueryWords,
        EmptyQuery,
    ]
class ComplexValue(LeafRule):
    """Accepting value with either single/double quotes or a regex value (/^.../$).

    These values have special and different meaning for the later phases of parsing:
      * Single quotes: partial text matching (text is analyzed before searched)
      * Double quotes: exact text matching
      * Regex: regex searches

    E.g. t 'Millisecond pulsar velocities'.

    This makes no difference for the parser and will be handled at a later parsing phase.
    """
    # Non-greedy alternatives so that the first closing delimiter ends the value.
    regex = re.compile(r"((/.+?/)|('.*?')|(\".*?\"))")
    grammar = attr('value', regex)
class InvenioKeywordQuery(BinaryRule):
    """Keyword queries with colon separator (i.e. Invenio style).

    There needs to be a distinction between Invenio and SPIRES keyword queries, so as the parser is able to
    recognize any terminal as keyword for the former ones.

    Note:
        "arxiv:arxiv_identifier" should be excluded from the generic keyword pattern as it is a special case
        of SimpleValue, since it contains ":".

    E.g. author: ellis, title: boson, or unknown_keyword: foo.
    """
    # Left side: a known INSPIRE keyword, or any colon-free token that is not "arxiv".
    grammar = attr('left', [InspireKeyword, re.compile(r"(?!arxiv)[^\s:]+")]), \
        omit(':'), \
        attr('right', Value)
class InspireKeyword(LeafRule):
    # InspireKeyword expects a word boundary at its end, excluding [.,] characters, since these might signify names.
    grammar = re.compile(
        r"({0})(?![,.])(?=(:|\b))".format("|".join(INSPIRE_PARSER_KEYWORDS.keys())),
        re.IGNORECASE
    )

    def __init__(self, value):
        # Map whatever shorthand was typed onto its canonical keyword.
        self.value = INSPIRE_PARSER_KEYWORDS[value.lower()]

    @classmethod
    def parse(cls, parser, text, pos):
        """Parse InspireKeyword. """
        try:
            rest, matched_keyword = parser.parse(text, cls.grammar)
        except SyntaxError as error:
            # No keyword at this position: report the error, consume nothing.
            return text, error
        return rest, InspireKeyword(matched_keyword)
class InspireKeyword(LeafRule):
    # InspireKeyword expects a word boundary at its end, excluding [.,] characters, since these might signify names.
    grammar = re.compile(
        r"({0})(?![,.])(?=(:|\b))".format("|".join(INSPIRE_PARSER_KEYWORDS.keys())),
        re.IGNORECASE
    )

    def __init__(self, value):
        # Map whatever shorthand was typed onto its canonical keyword.
        self.value = INSPIRE_PARSER_KEYWORDS[value.lower()]

    @classmethod
    def parse(cls, parser, text, pos):
        """Parse InspireKeyword.

        If the keyword is `texkey`, enable the parsing texkey expression flag, since its value contains ':'
        which normally isn't allowed.
        """
        try:
            rest, matched_keyword = parser.parse(text, cls.grammar)
        except SyntaxError as error:
            # No keyword here: make sure the texkey flag is off and consume nothing.
            parser._parsing_texkey_expression = False
            return text, error
        if matched_keyword.lower() == 'texkey':
            parser._parsing_texkey_expression = True
        return rest, InspireKeyword(matched_keyword)
# -*- coding: utf-8 -*- from __future__ import unicode_literals, print_function from pypeg2 import Symbol, Enum, List, K from pypeg2 import attr, re, some, maybe_some, optional Symbol.regex = re.compile(r'[\w\&\-]+') class Operator(Symbol): grammar = Enum(K("&"), K("-")) def _build(self, rr): rr._ops.append(self[0]) rr._nextop = self[0] return class Expando(List): grammar = '[', attr('begin', re.compile(r'\d+')), ['-', ':'], \ attr('end', re.compile(r'\d+')), ']' def _build(self, rr): for e in self: rr._expandos.append(e) return class StringPart(str): grammar = attr('part', re.compile(r'[\-_a-z0-9\.]+'))
class StringPart(str):
    """One plain-text fragment: lower-case letters, digits, '-', '_' or '.'."""
    grammar = attr('part', re.compile(r'[\-_a-z0-9\.]+'))

    def _build(self, rr):
        # NOTE(review): self[0] is the first element of this str subclass — confirm upstream intent.
        head = self[0]
        rr._strings.append(head)
        return
class MalformedQueryWords(ListRule):
    """Represents queries that weren't recognized by the main parsing branch of Statements."""
    # One or more whitespace-separated tokens, accepted verbatim.
    grammar = some(re.compile(r"[^\s]+", re.UNICODE))

    def __init__(self, children):
        # children: the list of raw matched tokens.
        self.children = children
class Pubkey(str):
    """
    Pubkey in transaction output condition
    """
    # PUBKEY_REGEX is defined at module level (outside this view).
    regex = re.compile(PUBKEY_REGEX)
class Pattern(str):
    """A '/.../'-delimited regular-expression literal."""
    grammar = re.compile(r'^\/.*\/$')

    def _build(self, rr):
        # NOTE(review): self[0] is the first element of this str subclass — confirm upstream intent.
        matched = self[0]
        rr._patterns.append(matched)
        return
class SimpleValueWithColonUnit(SimpleValueUnit):
    # Token that may contain ':' anywhere except as its final character; at least two characters long.
    token_regex = re.compile(r"[^\s)(]+[^\s:)(]", re.UNICODE)
class SimpleRangeValue(LeafRule):
    # Accept characters up to whitespace/parens; runs of '-' are allowed unless followed by '>'.
    grammar = attr('value', re.compile(r"([^\s)(-]|-+[^\s)(>])+"))
class Hash(str):
    """
    Hash in transaction output condition
    """
    # HASH_REGEX is defined at module level (outside this view).
    regex = re.compile(HASH_REGEX)
class SimpleValueUnit(LeafRule):
    """Represents either a terminal symbol (without parentheses) or a parenthesized SimpleValue.

    The parenthesized case (2nd option of SimpleValueUnit) accepts a SimpleValue which is the more generic
    case of plaintext and in turn (its grammar) encapsulates whitespace and SimpleValueUnit recognition.
    """
    token_regex = re.compile(r"[^\s:)(]+", re.UNICODE)

    arxiv_token_regex = re.compile(r"(arxiv:)(" + token_regex.pattern + ")", re.IGNORECASE)
    """Arxiv identifiers are special cases of tokens where the ":" symbol is allowed."""

    date_specifiers_regex = re.compile(
        r"({})\s*-\s*\d+".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE)

    parenthesized_token_grammar = None  # is set after SimpleValue definition.

    starts_with_colon = re.compile(r"\s*:", re.UNICODE)
    """Used for recognizing whether terminal token is a keyword (i.e. followed by some whitespace and ":"."""

    def __init__(self, args):
        super(SimpleValueUnit, self).__init__()
        if isinstance(args, six.string_types):
            # Value was recognized by the 1st option of the list grammar (regex)
            self.value = args
        else:
            # Value was recognized by the 2nd option of the list grammar
            # args is (open paren, SimpleValue, close paren).
            self.value = args[0] + args[1].value + args[2]

    @classmethod
    def parse_terminal_token(cls, parser, text):
        """Parses a terminal token that doesn't contain parentheses nor colon symbol.

        Note:
            Handles a special case of tokens where a ':' is needed (for `texkey` queries).

            If we're parsing text not in parentheses, then some DSL keywords (e.g. And, Or, Not, defined
            above) should not be recognized as terminals, thus we check if they are in the Keywords table
            (namespace like structure handled by PyPeg). This is done only when we are not parsing a
            parenthesized SimpleValue.

            Also, helps in supporting more implicit-and queries cases (last two checks).
        """
        token_regex = cls.token_regex

        match = token_regex.match(text)
        if match:
            matched_token = match.group(0)

            # Check if token is a DSL keyword. Disable this check in the case where the parser isn't parsing a
            # parenthesized terminal.
            if not parser._parsing_parenthesized_terminal and matched_token.lower() in Keyword.table:
                return text, SyntaxError("found DSL keyword: " + matched_token)

            remaining_text = text[len(matched_token):]

            # Attempt to recognize whether current terminal is followed by a ":", which definitely signifies that
            # we are parsing a keyword, and we shouldn't.
            if cls.starts_with_colon.match(remaining_text):
                return text, \
                    SyntaxError("parsing a keyword (token followed by \":\"): \"" + repr(matched_token) + "\"")

            # Attempt to recognize whether current terminal is a non shortened version of Inspire keywords. This is
            # done for supporting implicit-and in case of SPIRES style keyword queries. Using the non shortened version
            # of the keywords, makes this recognition not eager.
            if not parser._parsing_parenthesized_simple_values_expression \
                    and matched_token in INSPIRE_KEYWORDS_SET:
                return text, SyntaxError("parsing a keyword (non shortened INSPIRE keyword)")

            result = remaining_text, matched_token
        else:
            result = text, SyntaxError("expecting match on " + repr(cls.token_regex.pattern))
        return result

    @classmethod
    def parse(cls, parser, text, pos):
        """Imitates parsing a list grammar.

        Specifically, this
        grammar = [
            SimpleValueUnit.date_specifiers_regex,
            SimpleValueUnit.arxiv_token_regex,
            SimpleValueUnit.token_regex,
            SimpleValueUnit.parenthesized_token_grammar
        ].

        Parses plaintext which matches date specifiers or arxiv_identifier syntax, or is comprised of either
        1) simple terminal (no parentheses) or 2) a parenthesized SimpleValue.

        For example, "e(+)" will be parsed in two steps, first, "e" token will be recognized and then "(+)",
        as a parenthesized SimpleValue.
        """
        found = False

        # Attempt to parse date specifier
        match = cls.date_specifiers_regex.match(text)
        if match:
            remaining_text, token, found = text[len(match.group(0)):], match.group(0), True
        else:
            # Attempt to parse arxiv identifier
            match = cls.arxiv_token_regex.match(text)
            if match:
                # group(2) drops the "arxiv:" prefix, keeping only the identifier.
                remaining_text, token, found = text[len(match.group()):], match.group(2), True
            else:
                # Attempt to parse a terminal token
                remaining_text, token = cls.parse_terminal_token(parser, text)
                if type(token) != SyntaxError:
                    found = True
                else:
                    # Attempt to parse a terminal with parentheses
                    try:
                        # Enable parsing a parenthesized terminal so that we can accept {+, -, |} as terminals.
                        parser._parsing_parenthesized_terminal = True
                        remaining_text, token = parser.parse(text, cls.parenthesized_token_grammar, pos)

                        found = True
                    except SyntaxError:
                        pass
                    except GrammarValueError:
                        raise
                    except ValueError:
                        pass
                    finally:
                        # Always restore the flag, whatever the parse outcome was.
                        parser._parsing_parenthesized_terminal = False

        if found:
            result = remaining_text, cls(token)
        else:
            result = text, SyntaxError("expecting match on " + cls.__name__)

        return result
class Int(str):
    """
    Integer in transaction output condition
    """
    # One or more ASCII digits.
    regex = re.compile(r"[0-9]+")
class InspireKeyword(LeafRule):
    # Match any INSPIRE keyword, but only when it is followed by ':' or whitespace.
    grammar = re.compile(r"({0})(?=(:|\s))".format("|".join(
        INSPIRE_PARSER_KEYWORDS.keys())))

    def __init__(self, value):
        # NOTE(review): unlike other variants of this class, the lookup is case-sensitive here
        # (no re.IGNORECASE, no .lower()) — confirm this is intentional.
        self.value = INSPIRE_PARSER_KEYWORDS[value]
# NOTE(review): the lines below are the tail of a classmethod `parse` whose beginning lies
# outside this view — formatting reconstructed, tokens unchanged.
            else:
                result = text[len(match.group(0)):], cls(match.group(0))
        else:
            result = text, SyntaxError("expecting " + repr(cls.__name__))

        return result

    def __str__(self):
        return self.name

    def __repr__(self):
        return "%s()" % self.__class__.__name__


# Shorthand alias used throughout the grammar definitions.
CIKeyword = CaseInsensitiveKeyword


# NOTE(review): non-raw "\w+" happens to work here, but r"\w+" is the conventional form.
u_word = re.compile("\w+", re.UNICODE)
# ########################


class BooleanOperator(object):
    """Serves as the possible case for a boolean operator."""
    AND = 'and'
    OR = 'or'


class LeafRule(ast.Leaf):
    def __init__(self, value=None):
        # Only forward to ast.Leaf when a (truthy) value was supplied.
        if value:
            super(LeafRule, self).__init__(value)
class MalformedQueryText(LeafRule):
    """Represents queries that weren't recognized by the main parsing branch of Statements."""
    # One or more whitespace-separated tokens, accepted verbatim.
    grammar = some(re.compile(r"[^\s]+", re.UNICODE))

    def __init__(self, values):
        # Join the raw tokens back into a single space-separated string.
        # (Fix: dropped the redundant `[v for v in values]` copy — join accepts any iterable.)
        self.value = ' '.join(values)
# -*- coding: utf-8 -*- from __future__ import unicode_literals, print_function from pypeg2 import Symbol, Enum, List, K from pypeg2 import attr, re, some, maybe_some, optional Symbol.regex = re.compile(r"[\w\&\-]+") class Operator(Symbol): grammar = Enum(K("&"), K("-")) def _build(self, rr): rr._ops.append(self[0]) rr._nextop = self[0] return class Expando(List): grammar = "[", attr("begin", re.compile(r"\d+")), ["-", ":"], attr("end", re.compile(r"\d+")), "]" def _build(self, rr): for e in self: rr._expandos.append(e) return class StringPart(str): grammar = attr("part", re.compile(r"[\-_a-z0-9\.]+")) def _build(self, rr):