import re

import jieba
import numpy as np
from numpy import linalg


def load_stopwords(fin):
    ws = []
    with open(fin, 'r') as f:
        for line in f:
            w = line.strip('\n')
            # Convert to unicode, which is jieba's standard output type (str/unicode).
            w = jieba.strdecode(w)
            ws.append(w)
    # Also register the newline character itself as a stopword.
    w = jieba.strdecode('\n')
    ws.append(w)
    return set(ws)
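
# Usage sketch (not in the original code): 'stopwords.txt' is a hypothetical
# one-word-per-line file.
def _demo_load_stopwords():
    stopwords = load_stopwords('stopwords.txt')
    assert '\n' in stopwords  # the newline stopword registered above
    return stopwords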
def gen_doc_embedding(text):
    """
    :param text:
    :return: vector [float]. Note that it is not guaranteed to be a Python
        float32, because float32 does not exist as a type in Python; the
        actual value is a numpy array with dtype=float32.
    """
    global g_tokenizer, g_model
    text = jieba.strdecode(text)
    token_ids, segment_ids = g_tokenizer.encode(text)
    vec = g_model.predict([np.array([token_ids]), np.array([segment_ids])], batch_size=1)
    vec = vec[0][-2]  # use the hidden layer at index -2 as the vector
    vec = vec / linalg.norm(vec)
    return vec
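
# A minimal initialization sketch for the two globals above, assuming a
# bert4keras-style setup (an assumption; the original file does not show how
# g_tokenizer and g_model are built). `config_path`, `checkpoint_path` and
# `dict_path` are hypothetical parameters.
def _init_embedding_model(config_path, checkpoint_path, dict_path):
    global g_tokenizer, g_model
    from bert4keras.tokenizers import Tokenizer
    from bert4keras.models import build_transformer_model
    # Tokenizer.encode returns (token_ids, segment_ids), matching the call
    # in gen_doc_embedding above.
    g_tokenizer = Tokenizer(dict_path, do_lower_case=True)
    # The model maps [token_ids, segment_ids] batches to per-token hidden vectors.
    g_model = build_transformer_model(config_path, checkpoint_path)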
def _text_to_words(text):
    ws = []
    ss = re.split(r'\s+', text)
    for s in ss:
        s = jieba.strdecode(s)  # uniformly convert to unicode
        if _is_chinese_chars(s):
            ws.extend(jieba.lcut(s))
        else:
            ws.append(s)
    ws = [w for w in ws if w not in stopwords]  # drop stopwords
    return ws
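
# Usage sketch (not in the original code): whitespace-separated segments are
# kept as-is unless they are pure Chinese, in which case jieba segments them.
# Assumes the module-level `stopwords` set (e.g. from load_stopwords) and the
# `_is_chinese_chars` helper are defined elsewhere in the module.
def _demo_text_to_words():
    return _text_to_words(u'机器学习 machine learning 入门')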
def cut(self, sentence, cut_all=False, HMM=True):
    """The new cut function to replace the `jieba.cut` function.

    The details can be seen at
    https://yuque.alibaba-inc.com/ivr_algo/business/mc1fdi#6089454d.
    """
    sentence = jieba.strdecode(sentence)
    # re_eng_word is provided by the modified jieba described at the link above.
    re_eng_word = jieba.re_eng_word
    if cut_all:
        re_han = jieba.re_han_cut_all
        re_skip = jieba.re_skip_cut_all
    else:
        re_han = jieba.re_han_default
        re_skip = jieba.re_skip_default
    # The name-mangled attributes reach the private Tokenizer cut methods.
    if cut_all:
        cut_block = self._Tokenizer__cut_all
    elif HMM:
        cut_block = self._Tokenizer__cut_DAG
    else:
        cut_block = self._Tokenizer__cut_DAG_NO_HMM
    blocks = re_han.split(sentence)
    for blk in blocks:
        if not blk.strip():
            continue
        if re_eng_word.match(blk):
            # Blocks matching the English-word pattern are yielded unsplit.
            yield blk
        elif re_han.match(blk):
            for word in cut_block(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    yield x
                elif not cut_all:
                    for xx in x:
                        yield xx
                else:
                    yield x
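
# Installation sketch (an assumption; the original snippet does not show how
# `cut` gets wired in): bind it onto jieba's default Tokenizer instance so
# that callers pick up the new behavior.
def _install_custom_cut():
    import types
    jieba.dt.cut = types.MethodType(cut, jieba.dt)
    # The module-level alias jieba.cut was bound at import time, so it must
    # be re-pointed as well; jieba.lcut calls self.cut and needs no patching.
    jieba.cut = jieba.dt.cut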