from itertools import chain

# Note: the helpers referenced below (clean, tokenize, convert_numeric_expressions,
# norm_token, scalar_transform, token_types, dictionaries, word_token_replacements,
# OUT, bilou_encoding) are assumed to be imported at module level in the
# surrounding source; only itertools.chain is added here.


def expand_surface_forms(self, component, dictionary_types=None, normalize_numex=True,
                         convert_to_ascii=False, remove_accents=True):
    # Restrict matches to the requested dictionaries (default: all gazetteers),
    # always allowing the catch-all ANY classification.
    dictionary_types = {d.classification for d in (dictionary_types or self.all_gazetteers)}
    dictionary_types.add(dictionaries.ANY)

    if not component.strip():
        return []

    component = clean(component, to_ascii=convert_to_ascii, remove_accents=remove_accents)
    tokens = tokenize(component)
    if normalize_numex:
        tokens = convert_numeric_expressions(tokens)

    # Each original token can normalize to one or more tokens; norm_token_map
    # maps every normalized token's flat index back to its original token index.
    norm_tokens = [norm_token(c, t) for c, t in tokens]
    norm_token_map = []
    for i, t in enumerate(norm_tokens):
        norm_token_map.extend([i] * len(t))

    phrase_tokens = []
    possible_expansions = []

    # First pass: run the phrase filter over the flattened normalized tokens,
    # tagging each token of a matched phrase with a BILOU label plus its
    # candidate canonical expansions.
    for c, t, data in self.filter(list(chain(*norm_tokens))):
        if c == token_types.PHRASE:
            valid_expansions = {(dictionaries.registry[val], canonical)
                                for canonical, filename, val in data
                                if val in dictionary_types}
            len_t = len(t)
            if valid_expansions:
                phrase_tokens.extend([[(bilou_encoding(j, len_t), classification, tok[-1], canonical)
                                       for classification, canonical in valid_expansions]
                                      for j, tok in enumerate(t)])
            else:
                # Phrase matched, but not in any of the requested dictionaries.
                phrase_tokens.extend([[(OUT, tok[0], tok[-1], None)] for tok in t])
        else:
            phrase_tokens.append([(OUT, c, t[-1], None)])

    possible_expansions.append(phrase_tokens)

    # Second pass: add scalar (single-token) transforms as an alternative
    # expansion, but only if at least one token actually differs.
    single_tokens = []
    any_differing_tokens = False
    skip_until = 0

    for i, (ti, p) in enumerate(zip(norm_token_map, phrase_tokens)):
        if i < skip_until:
            continue
        norm = norm_tokens[ti]
        token_class, token = tokens[ti]
        # Keep the phrase-level candidates when the token normalized one-to-one.
        token_extensions = p[:] if len(norm) == 1 else []
        scalar_tokens = scalar_transform(token_class, token, norm)

        if scalar_tokens:
            if len(scalar_tokens) > 1 or len(scalar_tokens) < len(norm):
                # Transform changed the token count: take its output verbatim
                # and skip the normalized positions it covers.
                token_extensions.extend([(OUT, c, t, None) for c, t in scalar_tokens])
                skip_until = i + len(scalar_tokens) + 1
                any_differing_tokens = True
            elif scalar_tokens != [(token_class, token)]:
                # Single transformed token that differs from the original.
                token_extensions.append((OUT, scalar_tokens[0][0], scalar_tokens[0][1], None))
                any_differing_tokens = True

        single_tokens.append(token_extensions)

    if any_differing_tokens:
        possible_expansions.append(single_tokens)

    return possible_expansions
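# The bilou_encoding helper used above is not defined in this excerpt. The
# sketch below is a plausible stand-in, assuming it returns the BILOU chunk
# tag for token position j within a phrase of n tokens; the tag values are
# placeholder strings, not the project's real constants.

BEGIN, INSIDE, LAST, UNIT = 'B', 'I', 'L', 'U'

def bilou_encoding(j, n):
    if n == 1:
        return UNIT        # single-token phrase
    elif j == 0:
        return BEGIN       # first token of a multi-token phrase
    elif j == n - 1:
        return LAST        # final token of a multi-token phrase
    return INSIDE          # interior token

# e.g. [bilou_encoding(j, 3) for j in range(3)] == ['B', 'I', 'L']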
def norm_token(token_class, token):
    # Words, abbreviations and numerics get character-level replacements and
    # lowercasing; every other token class passes through unchanged.
    if token_class in (token_types.WORD, token_types.ABBREVIATION, token_types.NUMERIC):
        translated = token.translate(word_token_replacements).lower()
        if translated == token:
            # Normalization was a no-op: keep the original token as-is.
            return [(token_class, token)]
        # Replacements (or lowercasing) changed the string, which may have
        # introduced whitespace, so retokenize into one or more tokens.
        return tokenize(translated)
    return [(token_class, token)]
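# Self-contained illustration of the norm_token control flow above. The
# replacement table and tokenizer here are simplified stand-ins (a
# hyphen-to-space translation and a whitespace splitter), not the project's
# real word_token_replacements and tokenize implementations.

WORD = 'word'
word_token_replacements_demo = str.maketrans({'-': ' '})

def tokenize_demo(text):
    return [(WORD, t) for t in text.split()]

def norm_token_demo(token_class, token):
    translated = token.translate(word_token_replacements_demo).lower()
    if translated == token:
        return [(token_class, token)]  # no change: keep the original token
    return tokenize_demo(translated)   # changed: retokenize, possibly into several tokens

print(norm_token_demo(WORD, 'main'))
# [('word', 'main')]
print(norm_token_demo(WORD, 'Sainte-Anne'))
# [('word', 'sainte'), ('word', 'anne')]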