Example #1
    def expand_surface_forms(self, component, dictionary_types=None, normalize_numex=True, convert_to_ascii=False, remove_accents=True):
        dictionary_types = set([d.classification for d in (dictionary_types or self.all_gazetteers)] + [dictionaries.ANY])
        if not component.strip():
            return []
        component = clean(component, to_ascii=convert_to_ascii, remove_accents=remove_accents)
        tokens = tokenize(component)

        if normalize_numex:
            tokens = convert_numeric_expressions(tokens)

        norm_tokens = [norm_token(c, t) for c, t in tokens]
        # Map each normalized sub-token back to the index of the original token it came from
        norm_token_map = []
        for i, t in enumerate(norm_tokens):
            norm_token_map.extend([i] * len(t))

        phrase_tokens = []

        # First pass: scan the normalized token stream for known gazetteer phrases
        possible_expansions = []
        for c, t, data in self.filter(list(chain(*norm_tokens))):
            if c == token_types.PHRASE:
                valid_expansions = set([(dictionaries.registry[val], canonical) for canonical, filename, val in data if val in dictionary_types])
                len_t = len(t)
                if valid_expansions:
                    phrase_tokens.extend([[(bilou_encoding(j, len_t), classification, tok[-1], canonical) for classification, canonical in valid_expansions] for j, tok in enumerate(t)])
                else:
                    phrase_tokens.extend([[(OUT, tok[0], tok[-1], None)] for tok in t])
            else:
                phrase_tokens.append([(OUT, c, t[-1], None)])

        possible_expansions.append(phrase_tokens)

        # Second pass: per-token alternatives from scalar transforms,
        # kept only if at least one token actually changed
        single_tokens = []
        any_differing_tokens = False
        skip_until = 0
        for i, (ti, p) in enumerate(zip(norm_token_map, phrase_tokens)):
            if i < skip_until:
                continue
            norm = norm_tokens[ti]
            token_class, token = tokens[ti]
            token_extensions = p[:] if len(norm) == 1 else []
            scalar_tokens = scalar_transform(token_class, token, norm)
            if not scalar_tokens:
                pass
            elif len(scalar_tokens) > 1 or (len(scalar_tokens) < len(norm)):
                token_extensions.extend([(OUT, c, t, None) for c, t in scalar_tokens])
                skip_until = i + len(scalar_tokens) + 1
                any_differing_tokens = True
            elif scalar_tokens != tokens:
                token_extensions.append((OUT, scalar_tokens[0][0], scalar_tokens[0][1], None))
                any_differing_tokens = True
            single_tokens.append(token_extensions)

        if any_differing_tokens:
            possible_expansions.append(single_tokens)

        return possible_expansions
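The method returns a list of candidate token sequences: each candidate is a list of positions, and each position holds one or more (BILOU tag, classification or token class, token, canonical-or-None) tuples. A minimal sketch of how such a structure could be enumerated downstream, assuming only the tuple layout visible above (the helper itself is hypothetical and not part of the library):

from itertools import product

def candidate_sequences(possible_expansions):
    # Hypothetical consumer: for every candidate token list, yield one fully
    # specified sequence per combination of the per-position alternatives.
    for token_lists in possible_expansions:
        for combo in product(*token_lists):
            yield combo

A real consumer would also inspect the BILOU tag in each tuple so that a multi-token gazetteer phrase contributes its canonical form once, not once per token.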
Example #2
def norm_token(token_class, token):
    if token_class in (token_types.WORD, token_types.ABBREVIATION, token_types.NUMERIC):
        # Apply the character replacement table and lowercasing; re-tokenize below if the string changed
        translated = token.translate(word_token_replacements).lower()
        if translated == token:
            word_tokens = [(token_class, token)]
        else:
            word_tokens = tokenize(translated)

        return word_tokens
    else:
        return [(token_class, token)]
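norm_token only rewrites word-like tokens: it applies the module's replacement table plus lowercasing, and re-tokenizes the result when anything changed. A self-contained sketch of the same idea with a toy replacement table (word_token_replacements and tokenize are the library's own and are not reproduced here; the names below are illustrative only):

# Toy stand-ins for the library's replacement table and tokenizer
toy_replacements = str.maketrans({'-': ' ', '\u2019': "'"})

def toy_norm_token(token):
    translated = token.translate(toy_replacements).lower()
    if translated == token:
        # Nothing changed, keep the original token
        return [token]
    # The replacements may have introduced spaces, so split again
    return translated.split()

print(toy_norm_token('Saint-Denis'))  # ['saint', 'denis']
print(toy_norm_token('plaza'))        # ['plaza']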