示例#1
0
文件: word2vec.py 项目: lei1993/HanLP
class GazetterTransform(object):
    """Annotate samples with gazetteer (lexicon) match features.

    The words passed to the constructor are indexed in a trie. Calling the
    transform on a sample looks up every lexicon entry occurring in the
    token sequence and records, per token position, the matches that end
    there (left-to-right) and the matches that start there (right-to-left).

    NOTE(review): the class name looks like a typo for "Gazetteer", but it
    is kept as-is since external callers reference it.
    """

    def __init__(self, field, words: dict) -> None:
        """Build the lexicon trie.

        Args:
            field: Key of the token sequence inside each sample dict.
            words: Mapping from lexicon word to its integer id.
        """
        super().__init__()
        self.field = field
        self.trie = Trie()
        for token, token_id in words.items():
            self.trie[token] = token_id

    def __call__(self, sample: dict) -> dict:
        """Add ``{field}_skips_{l2r,r2l}_{offset,id,count}`` keys to *sample*."""
        tokens = sample[self.field]
        matches = self.trie.parse(tokens)
        size = len(tokens)
        # For each position: matches ending here (l2r) / starting here (r2l).
        left_to_right = [[] for _ in range(size)]
        right_to_left = [[] for _ in range(size)]
        for word, word_id, begin, end in matches:
            last = end - 1  # inclusive index of the match's final token
            left_to_right[last].append((begin, word, word_id))
            right_to_left[begin].append((last, word, word_id))
        for direction, buckets in (('skips_l2r', left_to_right),
                                   ('skips_r2l', right_to_left)):
            sample[f'{self.field}_{direction}_offset'] = [
                [entry[0] for entry in bucket] for bucket in buckets
            ]
            sample[f'{self.field}_{direction}_id'] = [
                [entry[-1] for entry in bucket] for bucket in buckets
            ]
            sample[f'{self.field}_{direction}_count'] = [len(b) for b in buckets]
        return sample
示例#2
0
文件: word2vec.py 项目: lei1993/HanLP
 def tokenizer(self):
     """Return the trie-based tokenizer, building it on first access.

     The tokenizer is cached on ``self._tokenizer``; subsequent calls
     return the cached instance without rebuilding.
     """
     if self._tokenizer:
         return self._tokenizer
     # First access: construct a trie over the token vocabulary.
     if HANLP_VERBOSE:
         flash(
             'Building Trie-based tokenizer for Doc2Vec [blink][yellow]...[/yellow][/blink]'
         )
     self._tokenizer = Trie(self.vocabs['token'].token_to_idx)
     if HANLP_VERBOSE:
         flash('')  # clear the transient progress message
     return self._tokenizer
示例#3
0
def split_sents(text: str, trie: Trie):
    """Cut *text* into the spans NOT covered by any longest trie match.

    Args:
        text: The input string.
        trie: A trie supporting ``parse_longest``.

    Returns:
        A 3-tuple ``(sents, offsets, words)`` where ``sents`` are the
        uncovered substrings, ``offsets`` their start positions in *text*,
        and ``words`` the raw longest-match results from the trie.
    """
    matches = trie.parse_longest(text)
    segments = []
    starts = []
    cursor = 0
    for begin, end, _value in matches:
        # Emit the gap between the previous match and this one, if any.
        if cursor != begin:
            segments.append(text[cursor:begin])
            starts.append(cursor)
        cursor = end
    # Trailing gap after the last match.
    if cursor != len(text):
        segments.append(text[cursor:])
        starts.append(cursor)
    return segments, starts, matches
示例#4
0
 def __init__(self, field, words: dict) -> None:
     """Index the lexicon *words* (word -> id mapping) in a trie.

     Args:
         field: Key of the token sequence inside each sample dict.
         words: Mapping from lexicon word to its integer id.
     """
     super().__init__()
     self.field = field
     trie = Trie()
     for token, token_id in words.items():
         trie[token] = token_id
     self.trie = trie
示例#5
0
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
from hanlp_trie.trie import Trie

import hanlp

# Load a pretrained tokenizer; this downloads the model on first use.
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

# Custom dictionary: map surface forms to arbitrary tag values; used to
# force rule-based segmentation around these entries.
trie = Trie()
trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})


def split_sents(text: str, trie: Trie):
    """Return the pieces of *text* lying between longest trie matches.

    Args:
        text: The input string.
        trie: A trie supporting ``parse_longest``.

    Returns:
        ``(sents, offsets, words)``: uncovered substrings, their start
        offsets in *text*, and the raw match tuples from the trie.
    """
    words = trie.parse_longest(text)
    sents, offsets = [], []
    prev_end = 0
    for start, end, _ in words:
        if prev_end != start:  # a gap precedes this match
            sents.append(text[prev_end:start])
            offsets.append(prev_end)
        prev_end = end
    if prev_end != len(text):  # text continues past the last match
        sents.append(text[prev_end:])
        offsets.append(prev_end)
    return sents, offsets, words