#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import unicode_literals from copy import deepcopy from itertools import chain import os from pypinyin.compat import text_type, callable_check from pypinyin.constants import (PHRASES_DICT, PINYIN_DICT, RE_HANS, TONE, NORMAL) from pypinyin.utils import simple_seg, _replace_tone2_style_dict_to_default from pypinyin.style import auto_discover, convert auto_discover() def seg(hans): if getattr(seg, 'no_jieba', None): ret = hans return simple_seg(ret) if seg.jieba is None: try: import jieba seg.jieba = jieba except ImportError: seg.no_jieba = True return seg(hans) else:
from __future__ import unicode_literals

from copy import deepcopy
from itertools import chain

from pypinyin.compat import text_type, callable_check
from pypinyin.constants import (
    PHRASES_DICT, PINYIN_DICT, RE_HANS, Style
)
from pypinyin.contrib import mmseg
from pypinyin.utils import simple_seg, _replace_tone2_style_dict_to_default
from pypinyin.style import auto_discover, convert as convert_style

auto_discover()


def seg(hans):
    """Split ``hans`` into a flat list of segments.

    The input is first passed through ``simple_seg``.  Each resulting piece
    is then handled as follows:

    * a piece that matches ``RE_HANS`` is cut further with
      ``mmseg.seg.cut`` -- but only while ``PHRASES_DICT`` is non-empty;
    * every other piece is kept as a single, unsplit item.

    :param hans: text to segment (forwarded unchanged to ``simple_seg``)
    :return: list of segments in their original order
    """
    pieces = []
    for chunk in simple_seg(hans):
        # Second-pass segmentation only applies to text matching RE_HANS,
        # and only when the phrase dictionary has not been disabled
        # (emptied).  Characters without pinyin, or any chunk while the
        # dictionary is empty, are passed through as-is.
        if RE_HANS.match(chunk) and PHRASES_DICT:
            pieces.extend(mmseg.seg.cut(chunk))
        else:
            pieces.append(chunk)
    return pieces