def simple_tokenizer(self, expression):
    """
    Yield a Token for each piece of the `expression` unicode string.

    The expression is split on spaces, keywords and parens. Any other run of
    characters is treated as a symbol token, typically a license key or
    license id (which contains no spaces or parens).

    When this object has known symbols, a symbol string is looked up by its
    lowercased form so that known symbol keys are recognized ignoring case.
    Unknown symbol strings get a new LicenseSymbol keyed by the string as
    written in the expression.
    """
    known_symbols = self.known_symbols_lowercase or {}

    for found in _simple_tokenizer(expression):
        if not found:
            continue

        # Token positions are string indexes with an inclusive end, hence
        # the minus one on the end of the match span.
        first, past_last = found.span()
        last = past_last - 1
        groups = found.groupdict()

        # Spaces and parens map directly to a fixed token value.
        fixed_values = (
            ('space', None),
            ('lpar', KW_LPAR),
            ('rpar', KW_RPAR),
        )
        for group_name, fixed_value in fixed_values:
            text = groups.get(group_name)
            if text:
                yield Token(first, last, text, fixed_value)

        text = groups.get('symop')
        if not text:
            continue

        # Operators take precedence over symbols; both are matched on the
        # lowercased text.
        lowered = text.lower()
        value = OPERATORS.get(lowered)
        if not value:
            value = known_symbols.get(lowered) or LicenseSymbol(key=text)
        yield Token(first, last, text, value)
def build_token_with_symbol():
    """
    Yield the Tokens built from the accumulated `unmatched` tokens, if any.

    This is a generator. It yields nothing when `unmatched` is empty.
    Otherwise it yields:

    - at most one new Token whose value is a LicenseSymbol built from the
      strings of the non-blank unmatched tokens joined with a single space,
      spanning from the start of the first unmatched token to the end of the
      last non-blank one;
    - then the blank tokens stripped from the end of `unmatched`, in their
      original order.

    `unmatched` is a list that is not defined in this function: it is read
    and mutated in place here (presumably it lives in the enclosing scope).
    """
    if not unmatched:
        return

    # Strip trailing blank tokens. pop() takes them from the end, so they are
    # collected last-to-first.
    trailing_spaces = []
    while unmatched and not unmatched[-1].string.strip():
        trailing_spaces.append(unmatched.pop())

    if unmatched:
        string = ' '.join(t.string for t in unmatched if t.string.strip())
        start = unmatched[0].start
        end = unmatched[-1].end
        toksym = LicenseSymbol(string)
        unmatched.clear()
        yield Token(start, end, string, toksym)

    # Emit the stripped blanks in their original order, not the reversed
    # order in which they were popped.
    for ts in reversed(trailing_spaces):
        yield ts
def test_iter_simple(self):
    """
    Check that Trie.iter() reports each added key found in a license
    expression with inclusive start and end offsets, matching keys regardless
    of case and returning the value stored for the key.
    """
    t = Trie()
    t.add('AND', 'AND')
    t.add('OR', 'OR')
    t.add('WITH', 'WITH')
    t.add('(', '(')
    t.add(')', ')')
    t.add('GPL-2.0', 'GPL-2.0')
    t.add('mit', 'MIT')
    t.add('Classpath', 'Classpath')
    t.make_automaton()

    # NOTE: the two spaces after the lowercase "or" and the trailing space
    # are deliberate: the expected offsets below depend on them.
    test_string = '(GPL-2.0 with Classpath) or (gpl-2.0) and (classpath or  gpl-2.0 OR mit) '
    #                        111111111122222222223333333333444444444455555555556666666666777
    #              0123456789012345678901234567890123456789012345678901234567890123456789012
    result = list(t.iter(test_string))
    expected = [
        Token(0, 0, u'(', u'('),
        Token(1, 7, u'GPL-2.0', u'GPL-2.0'),
        Token(9, 12, u'with', u'WITH'),
        Token(14, 22, u'Classpath', u'Classpath'),
        Token(23, 23, u')', u')'),
        Token(25, 26, u'or', u'OR'),
        Token(28, 28, u'(', u'('),
        Token(29, 35, u'gpl-2.0', u'GPL-2.0'),
        Token(36, 36, u')', u')'),
        Token(38, 40, u'and', u'AND'),
        Token(42, 42, u'(', u'('),
        Token(43, 51, u'classpath', u'Classpath'),
        Token(53, 54, u'or', u'OR'),
        Token(57, 63, u'gpl-2.0', u'GPL-2.0'),
        Token(65, 66, u'OR', u'OR'),
        Token(68, 70, u'mit', u'MIT'),
        Token(71, 71, u')', u')'),
    ]
    assert expected == result
def test_tokenize_with_unmatched_and_space(self):
    """
    Check that Trie.tokenize() with include_unmatched and include_space
    returns matched keywords, unmatched text and spaces as Tokens, and that
    joining the token strings gives back the input string.
    """
    # Automaton that only knows the parens and the AND / OR keywords.
    automaton = Trie()
    for keyword in '( AND ) OR'.split():
        automaton.add(keyword, keyword)
    automaton.make_automaton()

    test_string = '((l-a + AND l-b) OR an (l -c+))'
    #                        111111111122222222223
    #              0123456789012345678901234567890
    tokens = list(
        automaton.tokenize(
            test_string,
            include_unmatched=True,
            include_space=True,
        )
    )

    expected = [
        Token(0, 0, u'(', u'('),
        Token(1, 1, u'(', u'('),
        Token(2, 4, u'l-a', None),
        Token(5, 5, u' ', None),
        Token(6, 6, u'+', None),
        Token(7, 7, u' ', None),
        Token(8, 10, u'AND', u'AND'),
        Token(11, 11, u' ', None),
        Token(12, 14, u'l-b', None),
        Token(15, 15, u')', u')'),
        Token(16, 16, u' ', None),
        Token(17, 18, u'OR', u'OR'),
        Token(19, 19, u' ', None),
        Token(20, 21, u'an', None),
        Token(22, 22, u' ', None),
        Token(23, 23, u'(', u'('),
        Token(24, 24, u'l', None),
        Token(25, 25, u' ', None),
        Token(26, 28, u'-c+', None),
        Token(29, 29, u')', u')'),
        Token(30, 30, u')', u')'),
    ]
    assert expected == tokens

    # The token strings must cover the whole input.
    assert test_string == ''.join(token.string for token in tokens)
def test_iter_vs_tokenize(self):
    """
    Check that Trie.iter() and Trie.tokenize() return the same Tokens for
    this input when both unmatched text and spaces are included.
    """
    # Automaton that only knows the parens and the AND / OR keywords.
    automaton = Trie()
    for keyword in '( AND ) OR'.split():
        automaton.add(keyword, keyword)
    automaton.make_automaton()

    test_string = '((l-a + AND l-b) OR (l -c+))'
    scan_options = dict(include_unmatched=True, include_space=True)

    expected = [
        Token(0, 0, u'(', u'('),
        Token(1, 1, u'(', u'('),
        Token(2, 4, u'l-a', None),
        Token(5, 5, u' ', None),
        Token(6, 6, u'+', None),
        Token(7, 7, u' ', None),
        Token(8, 10, u'AND', u'AND'),
        Token(11, 11, u' ', None),
        Token(12, 14, u'l-b', None),
        Token(15, 15, u')', u')'),
        Token(16, 16, u' ', None),
        Token(17, 18, u'OR', u'OR'),
        Token(19, 19, u' ', None),
        Token(20, 20, u'(', u'('),
        Token(21, 21, u'l', None),
        Token(22, 22, u' ', None),
        Token(23, 25, u'-c+', None),
        Token(26, 26, u')', u')'),
        Token(27, 27, u')', u')'),
    ]

    from_iter = list(automaton.iter(test_string, **scan_options))
    assert expected == from_iter

    from_tokenize = list(automaton.tokenize(test_string, **scan_options))
    assert expected == from_tokenize
def test_iter_should_can_return_non_matches_optionally(self):
    """
    Check that Trie.iter() with include_unmatched and include_space also
    returns unmatched words and runs of spaces as Tokens with a None value,
    each run of consecutive spaces being a single Token.
    """

    def get_test_automaton():
        words = 'he her hers his she hi him man himan'.split()
        t = Trie()
        for w in words:
            t.add(w, w)
        t.make_automaton()
        return t

    # NOTE: the runs of two and three spaces are deliberate: the expected
    # offsets and the expected space token strings below depend on them.
    test_string = '  he she junk  himan  other stuffs   '
    #                        111111111122222222223333333
    #              0123456789012345678901234567890123456

    t = get_test_automaton()
    result = list(
        t.iter(test_string, include_unmatched=True, include_space=True))
    expected = [
        Token(0, 1, u'  ', None),
        Token(2, 3, u'he', u'he'),
        Token(4, 4, u' ', None),
        Token(5, 7, u'she', u'she'),
        Token(8, 8, u' ', None),
        Token(9, 12, u'junk', None),
        Token(13, 14, u'  ', None),
        Token(15, 19, u'himan', u'himan'),
        Token(20, 21, u'  ', None),
        Token(22, 26, u'other', None),
        Token(27, 27, u' ', None),
        Token(28, 33, u'stuffs', None),
        Token(34, 36, u'   ', None),
    ]
    assert expected == result
def replace_with_subexpression_by_license_symbol(tokens, strict=False):
    """
    Given an iterable of Token, yield tokens, replacing any "XXX WITH ZZZ"
    subexpression by a single Token whose value is a
    LicenseWithExceptionSymbol.

    Check the validity of WITH subexpressions and raise ParseError as needed:

    - a group made of a lone WITH keyword is an invalid expression;
    - a group that has neither one nor three tokens is an invalid expression;
    - both sides of a WITH must have a LicenseSymbol value.

    If `strict` is True also raise ParseError if the left hand side
    LicenseSymbol has is_exception True or if the right hand side
    LicenseSymbol has is_exception False. In strict mode, a standalone
    LicenseSymbol with is_exception True is also rejected.
    """
    token_groups = build_token_groups_for_with_subexpression(tokens)

    for token_group in token_groups:
        len_group = len(token_group)

        if not len_group:
            # This should never happen
            continue

        if len_group == 1:
            # a single token
            token = token_group[0]
            tval = token.value

            if isinstance(tval, Keyword):
                if tval.type == TOKEN_WITH:
                    # a single group cannot be a single 'WITH' keyword:
                    # this is an error that we catch and raise here.
                    raise ParseError(
                        token_type=TOKEN_WITH,
                        token_string=token.string,
                        position=token.start,
                        error_code=PARSE_INVALID_EXPRESSION,
                    )

            elif isinstance(tval, LicenseSymbol):
                if strict and tval.is_exception:
                    raise ParseError(
                        token_type=TOKEN_SYMBOL,
                        token_string=token.string,
                        position=token.start,
                        error_code=PARSE_INVALID_EXCEPTION,
                    )

            else:
                # this should not be possible by design
                raise Exception(
                    'Licensing.tokenize is internally confused...:'
                    + repr(tval))

            yield token
            continue

        if len_group != 3:
            # this should never happen
            string = ' '.join([tok.string for tok in token_group])
            start = token_group[0].start
            raise ParseError(
                token_type=TOKEN_SYMBOL,
                token_string=string,
                position=start,
                error_code=PARSE_INVALID_EXPRESSION,
            )

        # from now on we have a triple of tokens: a WITH sub-expression such
        # as "A with B" made of a sequence of three tokens
        lic_token, with_token, exc_token = token_group

        token_string = ' '.join(
            [lic_token.string, with_token.string.strip(), exc_token.string])

        # the left hand side license symbol
        lic_sym = lic_token.value

        # this should not happen
        if not isinstance(lic_sym, LicenseSymbol):
            raise ParseError(
                token_type=TOKEN_SYMBOL,
                token_string=lic_token.string,
                position=lic_token.start,
                error_code=PARSE_INVALID_SYMBOL,
            )

        if strict and lic_sym.is_exception:
            raise ParseError(
                token_type=TOKEN_SYMBOL,
                token_string=lic_token.string,
                position=lic_token.start,
                error_code=PARSE_INVALID_EXCEPTION,
            )

        # the right hand side exception symbol
        exc_sym = exc_token.value

        if not isinstance(exc_sym, LicenseSymbol):
            # Report the offending right hand side token. The string and
            # position come from the Token, not from the symbol value.
            raise ParseError(
                token_type=TOKEN_SYMBOL,
                token_string=exc_token.string,
                position=exc_token.start,
                error_code=PARSE_INVALID_SYMBOL,
            )

        if strict and not exc_sym.is_exception:
            raise ParseError(
                token_type=TOKEN_SYMBOL,
                token_string=exc_token.string,
                position=exc_token.start,
                error_code=PARSE_INVALID_SYMBOL_AS_EXCEPTION,
            )

        lic_exc_sym = LicenseWithExceptionSymbol(lic_sym, exc_sym, strict)

        # The combined token spans from the start of the license token to
        # the end of the exception token.
        yield Token(
            lic_token.start,
            exc_token.end,
            token_string,
            lic_exc_sym,
        )