def do_draw(self, data):
    # 1 - Select a valid top-level domain (TLD) name
    # 2 - Check that the number of characters in our selected TLD won't
    #     prevent us from generating at least a 1 character subdomain.
    # 3 - Randomize the TLD between upper and lower case characters.
    domain = data.draw(
        st.sampled_from(TOP_LEVEL_DOMAINS)
        .filter(lambda tld: len(tld) + 2 <= self.max_length)
        .flatmap(
            lambda tld: st.tuples(
                *[st.sampled_from([c.lower(), c.upper()]) for c in tld]
            ).map(u"".join)
        )
    )
    # The maximum possible number of subdomains is 126,
    # 1 character subdomain + 1 '.' character, * 126 = 252,
    # with a max of 255, that leaves 3 characters for a TLD.
    # Allowing any more subdomains would not leave enough
    # characters for even the shortest possible TLDs.
    elements = cu.many(data, min_size=1, average_size=1, max_size=126)
    while elements.more():
        # Generate a new valid subdomain using the regex strategy.
        sub_domain = data.draw(st.from_regex(self.label_regex, fullmatch=True))
        if len(domain) + len(sub_domain) >= self.max_length:
            data.stop_example(discard=True)
            break
        domain = sub_domain + "." + domain
    return domain
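
# A minimal usage sketch, not part of the snippet above: current Hypothesis
# releases expose this strategy as hypothesis.provisional.domains(); the exact
# entry point may differ for the version the do_draw above was written against.
from hypothesis import given
from hypothesis.provisional import domains

@given(domains())
def test_generates_plausible_domains(domain):
    # Per the length reasoning in do_draw, the full name never exceeds 255 chars.
    assert len(domain) <= 255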
def __init__(self, grammar, start, explicit):
    assert isinstance(grammar, lark.lark.Lark)
    if start is None:
        start = grammar.options.start
    if not isinstance(start, list):
        start = [start]
    self.grammar = grammar

    if "start" in getfullargspec(grammar.grammar.compile).args:
        terminals, rules, ignore_names = grammar.grammar.compile(start)
    else:  # pragma: no cover
        # This branch is to support lark <= 0.7.1, without the start argument.
        terminals, rules, ignore_names = grammar.grammar.compile()

    self.names_to_symbols = {}

    for r in rules:
        t = r.origin
        self.names_to_symbols[t.name] = t

    for t in terminals:
        self.names_to_symbols[t.name] = Terminal(t.name)

    self.start = st.sampled_from([self.names_to_symbols[s] for s in start])

    self.ignored_symbols = (
        st.sampled_from([self.names_to_symbols[n] for n in ignore_names])
        if ignore_names
        else st.nothing()
    )

    self.terminal_strategies = {
        t.name: st.from_regex(t.pattern.to_regexp(), fullmatch=True)
        for t in terminals
    }
    unknown_explicit = set(explicit) - get_terminal_names(
        terminals, rules, ignore_names
    )
    if unknown_explicit:
        raise InvalidArgument(
            "The following arguments were passed as explicit_strategies, "
            "but there is no such terminal production in this grammar: %r"
            % (sorted(unknown_explicit),)
        )
    self.terminal_strategies.update(explicit)

    nonterminals = {}

    for rule in rules:
        nonterminals.setdefault(rule.origin.name, []).append(tuple(rule.expansion))

    for v in nonterminals.values():
        v.sort(key=len)

    self.nonterminal_strategies = {
        k: st.sampled_from(v) for k, v in nonterminals.items()
    }

    self.__rule_labels = {}
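
# A hedged sketch of the feature this constructor enables: in the public API
# the `explicit` mapping is passed through from_lark(..., explicit=...), which
# validates it exactly as above. The toy grammar and terminal name below are
# illustrative assumptions, not taken from the source.
import lark
from hypothesis import strategies as st
from hypothesis.extra.lark import from_lark

KV_GRAMMAR = lark.Lark(
    """
    pair: NAME "=" NAME
    NAME: /[a-z]+/
    """,
    start="pair",
)

# Override generation for the NAME terminal instead of using its regex;
# passing a name that is not a terminal in the grammar raises InvalidArgument.
pairs = from_lark(KV_GRAMMAR, explicit={"NAME": st.sampled_from(["foo", "bar"])})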
def __init__(self, grammar, start=None):
    check_type(lark.lark.Lark, grammar, "grammar")
    if start is None:
        start = grammar.options.start
    if not isinstance(start, list):
        start = [start]
    self.grammar = grammar

    if "start" in getfullargspec(grammar.grammar.compile).args:
        terminals, rules, ignore_names = grammar.grammar.compile(start)
    else:  # pragma: no cover
        # This branch is to support lark <= 0.7.1, without the start argument.
        terminals, rules, ignore_names = grammar.grammar.compile()

    self.names_to_symbols = {}

    for r in rules:
        t = r.origin
        self.names_to_symbols[t.name] = t

    for t in terminals:
        self.names_to_symbols[t.name] = Terminal(t.name)

    self.start = st.sampled_from([self.names_to_symbols[s] for s in start])

    self.ignored_symbols = (
        st.sampled_from([self.names_to_symbols[n] for n in ignore_names])
        if ignore_names
        else st.nothing()
    )

    self.terminal_strategies = {
        t.name: st.from_regex(t.pattern.to_regexp(), fullmatch=True)
        for t in terminals
    }

    nonterminals = {}

    for rule in rules:
        nonterminals.setdefault(rule.origin.name, []).append(tuple(rule.expansion))

    for v in nonterminals.values():
        v.sort(key=len)

    self.nonterminal_strategies = {
        k: st.sampled_from(v) for k, v in nonterminals.items()
    }

    self.__rule_labels = {}
def __init__(self, grammar, start=None):
    check_type(lark.lark.Lark, grammar, "grammar")
    if start is None:
        start = grammar.options.start
    self.grammar = grammar

    terminals, rules, ignore_names = grammar.grammar.compile()

    self.names_to_symbols = {}

    for r in rules:
        t = r.origin
        self.names_to_symbols[t.name] = t

    for t in terminals:
        self.names_to_symbols[t.name] = Terminal(t.name)

    self.start = self.names_to_symbols[start]

    self.ignored_symbols = (
        st.sampled_from([self.names_to_symbols[n] for n in ignore_names])
        if ignore_names
        else st.nothing()
    )

    self.terminal_strategies = {
        t.name: st.from_regex(t.pattern.to_regexp(), fullmatch=True)
        for t in terminals
    }

    nonterminals = {}

    for rule in rules:
        nonterminals.setdefault(rule.origin.name, []).append(tuple(rule.expansion))

    for v in nonterminals.values():
        v.sort(key=len)

    self.nonterminal_strategies = {
        k: st.sampled_from(v) for k, v in nonterminals.items()
    }

    self.__rule_labels = {}
def from_lark(grammar, start=None):
    # type: (lark.lark.Lark, Text) -> st.SearchStrategy[Text]
    """A strategy for strings accepted by the given context-free grammar.

    ``grammar`` must be a ``Lark`` object, which wraps an EBNF specification.
    The Lark EBNF grammar reference can be found
    `here <https://lark-parser.readthedocs.io/en/latest/grammar/>`_.

    ``from_lark`` will automatically generate strings matching the
    nonterminal ``start`` symbol in the grammar, which was supplied as an
    argument to the Lark class.  To generate strings matching a different
    symbol, including terminals, you can override this by passing the
    ``start`` argument to ``from_lark``.
    """
    check_type(lark.lark.Lark, grammar, "grammar")
    if start is None:
        start = grammar.options.start

    # Compiling the EBNF grammar to a sanitised and canonicalised BNF
    # format makes further transformations much easier.
    terminals, rules, ignore_names = grammar.grammar.compile()

    # Map all terminals to the corresponding regular expression, and
    # thence to a strategy for producing matching strings.
    # We'll add strategies for non-terminals to this mapping later.
    strategies = {
        t.name: st.from_regex(t.pattern.to_regexp(), fullmatch=True)
        for t in terminals
    }
    if start in strategies:
        return strategies[start]

    # Reshape our flat list of rules into a dict of rulename to list of
    # possible productions for that rule.  We sort productions by increasing
    # number of parts as a heuristic for shrinking order.
    nonterminals = {
        origin.name: sorted(
            [rule.expansion for rule in rules if rule.origin == origin], key=len
        )
        for origin in set(rule.origin for rule in rules)
    }

    @st.cacheable
    @st.defines_strategy_with_reusable_values
    def convert(expansion):
        parts = []
        for p in expansion:
            if parts and ignore_names:
                # Chance to insert ignored substrings between meaningful
                # tokens, e.g. whitespace between values in JSON.
                parts.append(
                    st.just(u"")
                    | st.one_of([strategies[name] for name in ignore_names])
                )
            if p.name in strategies:
                # This might be a Terminal, or it might be a NonTerminal
                # that we've previously handled.
                parts.append(strategies[p.name])
            else:
                # It must be the first time we've encountered this NonTerminal.
                # Recurse to handle it, relying on lazy strategy instantiation
                # to allow forward references, then add it to the strategies
                # cache to avoid infinite loops.
                assert isinstance(p, lark.grammar.NonTerminal)
                s = st.one_of([convert(ex) for ex in nonterminals[p.name]])
                parts.append(s)
                strategies[p.name] = s
        # Special-case rules with only one expansion; it's worthwhile being
        # efficient when this includes terminals!  Otherwise, join the parts.
        if len(parts) == 1:
            return parts[0]
        return st.tuples(*parts).map(u"".join)

    # Most grammars describe several production rules, so we check the start
    # option passed to Lark to see which nonterminal we're going to produce.
    return st.one_of([convert(ex) for ex in nonterminals[start]])
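
# A minimal usage sketch, not taken from the source: the toy JSON-ish grammar
# and the test below are illustrative assumptions; only the from_lark and
# lark.Lark APIs themselves are real.
import lark
from hypothesis import given
from hypothesis.extra.lark import from_lark

LIST_GRAMMAR = lark.Lark(
    """
    value: "true" | "false" | NUMBER | list
    list: "[" [value ("," value)*] "]"
    %import common.NUMBER
    """,
    start="value",
)

@given(from_lark(LIST_GRAMMAR))
def test_grammar_accepts_generated_text(text):
    # Every string generated by from_lark should parse under the same grammar.
    LIST_GRAMMAR.parse(text)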