def load_stopwords(fin):
    """Load a stopword list from a text file, one stopword per line.

    Only the newline character is stripped from each line, so any other
    surrounding whitespace stays part of the stopword. Every entry is passed
    through ``jieba.strdecode`` so that it is in the same unicode ``str``
    form that jieba emits.

    :param fin: path of the stopword file.
    :return: a ``set`` containing every stopword from the file plus the
        newline character itself.
    """
    with open(fin, 'r') as handle:
        stopword_set = {jieba.strdecode(raw.strip('\n')) for raw in handle}
    # The newline character is always treated as a stopword.
    stopword_set.add(jieba.strdecode('\n'))
    return stopword_set
Пример #2
0
def gen_doc_embedding(text):
    """Encode ``text`` into an embedding vector scaled to unit norm.

    The text is tokenized with the module-level ``g_tokenizer`` and fed to
    the module-level ``g_model`` as a batch of one. Both globals are only
    read here, so no ``global`` declaration is required.

    :param text: input text; it is passed through ``jieba.strdecode`` first.
    :return: the vector divided by its ``linalg.norm``. It is not guaranteed
        to be a list of Python floats (Python has no float32 type); per the
        original author's note the value is currently a ``numpy.array`` with
        ``dtype=float32``.
    """
    text = jieba.strdecode(text)
    token_ids, segment_ids = g_tokenizer.encode(text)
    vec = g_model.predict([np.array([token_ids]), np.array([segment_ids])], batch_size=1)
    # Original author's note: use the "-2" hidden layer as the vector.
    # NOTE(review): this takes index -2 of the first batch item; confirm
    # against the model's output layout that this axis really is the layer.
    vec = vec[0][-2]
    # NOTE(review): an all-zero vector would divide by zero here.
    vec = vec / linalg.norm(vec)
    return vec
def _text_to_words(text):
    """Split ``text`` into words and drop stopwords.

    The text is first split on runs of whitespace. Each piece is normalised
    with ``jieba.strdecode``; pieces for which ``_is_chinese_chars`` is true
    are segmented with ``jieba.lcut``, all other pieces are kept whole.
    Finally, every word found in the module-level ``stopwords`` collection is
    removed.

    Leading or trailing whitespace in ``text`` makes ``re.split`` produce
    empty-string pieces; they are handled like any other piece.

    :param text: the raw input text.
    :return: list of remaining words, in their original order.
    """
    words = []
    for chunk in re.split(r'\s+', text):
        chunk = jieba.strdecode(chunk)  # normalise to unicode str
        if _is_chinese_chars(chunk):
            words.extend(jieba.lcut(chunk))
        else:
            words.append(chunk)
    # A single filtering pass is enough; a second one cannot remove anything.
    return [w for w in words if w not in stopwords]
Пример #4
0
def cut(self, sentence, cut_all=False, HMM=True):
    """Segment ``sentence`` into words; meant to replace ``jieba.cut``.

    Background: https://yuque.alibaba-inc.com/ivr_algo/business/mc1fdi#6089454d.

    The sentence is split into blocks with the "han" pattern for the chosen
    mode, and each block is handled as follows:

    * blocks that are empty or whitespace-only are skipped;
    * blocks matching ``jieba.re_eng_word`` are yielded unchanged;
    * blocks matching the "han" pattern are segmented by the tokenizer's
      private cut routine (full mode, DAG with HMM, or DAG without HMM);
    * any other block is split with the "skip" pattern; in full mode every
      piece is yielded whole, otherwise pieces matching the "skip" pattern
      are yielded whole and the rest character by character.

    :param sentence: text to segment; normalised with ``jieba.strdecode``.
    :param cut_all: use the full-mode patterns and cut routine when true.
    :param HMM: when ``cut_all`` is false, selects the DAG routine with HMM
        (true) or without it (false).
    :return: a generator yielding the segmented pieces in order.
    """
    sentence = jieba.strdecode(sentence)
    eng_word_pattern = jieba.re_eng_word

    # Choose the patterns and the segmentation routine for the requested mode.
    if cut_all:
        han_pattern = jieba.re_han_cut_all
        skip_pattern = jieba.re_skip_cut_all
        segment = self._Tokenizer__cut_all
    else:
        han_pattern = jieba.re_han_default
        skip_pattern = jieba.re_skip_default
        if HMM:
            segment = self._Tokenizer__cut_DAG
        else:
            segment = self._Tokenizer__cut_DAG_NO_HMM

    for block in han_pattern.split(sentence):
        if not block.strip():
            continue

        if eng_word_pattern.match(block):
            yield block
            continue

        if han_pattern.match(block):
            for word in segment(block):
                yield word
            continue

        for piece in skip_pattern.split(block):
            if cut_all or skip_pattern.match(piece):
                yield piece
            else:
                for char in piece:
                    yield char