def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, token_options=DEFAULT_TOKEN_OPTIONS, strip_parentheticals=True):
    '''
    Normalize a string, tokenize it, and normalize each token with the
    given string-level and token-level options.

    Only libpostal's deterministic normalizations (methods with a single
    output) are used here. The string-tree variant instead returns
    multiple normalized strings, each with its own tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    decoded = safe_decode(s)

    # Choose the Latin-ASCII transliteration path only when the caller
    # requested it via the string options bitmask.
    if decoded is not None and string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        normalized = _normalize.normalize_string_latin(decoded, string_options)
    else:
        normalized = _normalize.normalize_string_utf8(decoded, string_options)

    # tokenize_raw yields raw (offset, len, type) tuples; normalize each
    # token's text and map its numeric type id to a token_types value.
    tokens = []
    for raw in tokenize_raw(normalized):
        token_text = _normalize.normalize_token(normalized, raw, token_options)
        tokens.append((token_text, token_types.from_id(raw[-1])))

    if not strip_parentheticals:
        return tokens
    return remove_parens(tokens)
def parse_address(address, language=None, country=None):
    '''
    Parse an address into its components via libpostal's parser.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    '''
    decoded = safe_decode(address, 'utf-8')
    return _parser.parse_address(decoded, language=language, country=country)
def expand_address(address, languages=DEFAULT_LANGUAGES, **kw):
    '''
    Expand an address into its normalized variants via libpostal.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param languages: a tuple or list of ISO language code strings
        (e.g. "en", "fr", "de", etc.) to use in expansion. Default is
        English. Until automatic language classification is ready in
        libpostal, this parameter is required.
    '''
    decoded = safe_decode(address, 'utf-8')
    return _expand.expand_address(decoded, languages=languages, **kw)
def tokenize(s):
    '''
    Tokenize a string, returning a list of (token_text, token_type) pairs.

    The tokenizer runs over the Unicode form of the input; each token's
    (start, length) is then used to slice the encoded form of the same
    input, and the slice is decoded back to Unicode.
    '''
    unicode_input = safe_decode(s)
    encoded = safe_encode(s)

    result = []
    for start, length, token_type in _tokenize.tokenize(unicode_input):
        token_text = safe_decode(encoded[start:start + length])
        result.append((token_text, token_types.from_id(token_type)))
    return result
def tokenize_raw(s):
    '''
    Tokenize a string and return the raw tuples from the C tokenizer
    (offset, length, type) without any post-processing.
    '''
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)