class GazetterTransform(object):
    """Annotate a sample with gazetteer (lexicon) match features.

    Builds a trie from a word->id mapping and, for a given token field,
    records for every token position the lexicon words ending there
    (left-to-right skips) and starting there (right-to-left skips),
    as parallel offset / id / count fields on the sample.
    """

    def __init__(self, field, words: dict) -> None:
        """
        Args:
            field: name of the sample key holding the token sequence.
            words: mapping from lexicon word to its integer id.
        """
        super().__init__()
        self.field = field
        self.trie = Trie()
        for word, word_id in words.items():
            self.trie[word] = word_id

    def __call__(self, sample: dict) -> dict:
        """Attach skip-connection features for tokens in ``sample[self.field]``.

        Adds six keys (``{field}_skips_{l2r,r2l}_{offset,id,count}``) and
        returns the mutated sample.
        """
        tokens = sample[self.field]
        # assumes trie.parse yields (word, id, start, end) with end exclusive
        matches = self.trie.parse(tokens)
        left_to_right = [[] for _ in tokens]   # words ending at each position
        right_to_left = [[] for _ in tokens]   # words starting at each position
        for word, word_id, begin, end in matches:
            last = end - 1  # inclusive index of the word's final token
            left_to_right[last].append((begin, word, word_id))
            right_to_left[begin].append((last, word, word_id))
        for direction, buckets in (('skips_l2r', left_to_right),
                                   ('skips_r2l', right_to_left)):
            sample[f'{self.field}_{direction}_offset'] = [
                [m[0] for m in bucket] for bucket in buckets
            ]
            sample[f'{self.field}_{direction}_id'] = [
                [m[-1] for m in bucket] for bucket in buckets
            ]
            sample[f'{self.field}_{direction}_count'] = [len(b) for b in buckets]
        return sample
def tokenizer(self):
    """Lazily build and memoize a Trie-based tokenizer over the token vocab.

    Returns:
        The cached ``Trie`` built from ``self.vocabs['token'].token_to_idx``,
        constructing it on first access (with optional progress flashes).
    """
    cached = self._tokenizer
    if not cached:
        if HANLP_VERBOSE:
            flash(
                'Building Trie-based tokenizer for Doc2Vec [blink][yellow]...[/yellow][/blink]'
            )
        cached = Trie(self.vocabs['token'].token_to_idx)
        self._tokenizer = cached
        if HANLP_VERBOSE:
            flash('')  # clear the transient status line
    return cached
def split_sents(text: str, trie: "Trie"):
    """Split *text* into the spans NOT covered by longest trie matches.

    Args:
        text: the input string.
        trie: a trie whose ``parse_longest`` yields ``(start, end, value)``
            with ``end`` exclusive, in left-to-right order.

    Returns:
        A 3-tuple ``(sents, offsets, words)``: the uncovered substrings,
        their starting offsets in *text*, and the raw match list.
    """
    words = trie.parse_longest(text)
    sents, offsets = [], []
    cursor = 0  # end of the last consumed region
    for begin, end, _value in words:
        if cursor != begin:  # gap before this match -> emit it
            sents.append(text[cursor:begin])
            offsets.append(cursor)
        cursor = end
    if cursor != len(text):  # trailing uncovered tail
        sents.append(text[cursor:])
        offsets.append(cursor)
    return sents, offsets, words
def __init__(self, field, words: dict) -> None:
    """Build a gazetteer trie mapping each lexicon word to its id.

    Args:
        field: name of the sample key holding the token sequence.
        words: mapping from lexicon word to its integer id.
    """
    super().__init__()
    self.field = field
    trie = Trie()
    for word, word_id in words.items():
        trie[word] = word_id
    self.trie = trie
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
from hanlp_trie.trie import Trie

import hanlp


def split_sents(text: str, trie: Trie):
    """Return the spans of *text* not covered by longest trie matches.

    Args:
        text: input string.
        trie: trie whose ``parse_longest`` yields ``(start, end, value)``
            with ``end`` exclusive.

    Returns:
        ``(sents, offsets, words)``: uncovered substrings, their start
        offsets, and the raw match list.
    """
    words = trie.parse_longest(text)
    sents, offsets = [], []
    cursor = 0  # end of the last consumed region
    for begin, end, _value in words:
        if cursor != begin:  # emit the gap before this match
            sents.append(text[cursor:begin])
            offsets.append(cursor)
        cursor = end
    if cursor != len(text):  # trailing uncovered tail
        sents.append(text[cursor:])
        offsets.append(cursor)
    return sents, offsets, words


# Load a neural tokenizer and show its output without any custom rules.
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

# Custom dictionary entries are handled by a rule-based trie, not the model.
trie = Trie()
trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})