Example #1
def jieba_tokenize(text, external_wordlist=False):
    """
    Tokenize the given text into tokens whose word frequencies can probably
    be looked up. This uses Jieba, a word-frequency-based tokenizer.

    If `external_wordlist` is False, we tell Jieba to default to using
    wordfreq's own Chinese wordlist, and not to infer unknown words using a
    hidden Markov model. This ensures that the multi-character tokens that it
    outputs will be ones whose word frequencies we can look up.

    If `external_wordlist` is True, this will use the largest version of
    Jieba's original dictionary, with HMM enabled, so its results will be
    independent of the data in wordfreq. These results will be better optimized
    for purposes that aren't looking up word frequencies, such as general-
    purpose tokenization, or collecting word frequencies in the first place.
    """
    global jieba_tokenizer, jieba_orig_tokenizer
    if external_wordlist:
        if jieba_orig_tokenizer is None:
            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
        return jieba_orig_tokenizer.lcut(text)
    else:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)

        # Tokenize the Simplified Chinese version of the text, but return
        # those spans from the original text, even if it's in Traditional
        # Chinese
        tokens = []
        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
            tokens.append(text[start:end])
        return tokens
Example #2
File: example.py Project: sjx0451/QizNLP
    def __init__(self, ckpt_name=None, pbmodel_dir=None):
        assert ckpt_name or pbmodel_dir, 'use at least one way'
        self.graph = tf.Graph()
        self.config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True),
        )
        self.sess = tf.Session(config=self.config, graph=self.graph)

        self.token2id_dct = {
            'word2id':
            utils.Any2Id.from_file(
                f'{curr_dir}/../data/toutiao_cls_word2id.dct',
                use_line_no=True),
            'label2id':
            utils.Any2Id.from_file(
                f'{curr_dir}/../data/toutiao_cls_label2id.dct',
                use_line_no=True),
        }
        self.jieba = jieba.Tokenizer()
        self.tokenize = lambda t: self.jieba.lcut(re.sub(r'\s+', ',', t))
        self.cut = lambda t: ' '.join(self.tokenize(t))
        if ckpt_name:
            self.load_from_ckpt_meta(ckpt_name)
        else:
            self.load_from_pbmodel(pbmodel_dir)

        self.id2label = self.token2id_dct['label2id'].get_reverse()
Example #3
    def __setstate__(self, state):
        import jieba

        self.vocabulary = state or []
        self.tokenizer = jieba.Tokenizer()
        for word in self.vocabulary:
            self.tokenizer.add_word(word)
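
For context, a plausible __getstate__ counterpart under the same assumption (only the word list is serialized; the jieba Tokenizer itself is rebuilt on unpickling) might look like this sketch:

    def __getstate__(self):
        # hypothetical sketch: persist only the vocabulary, not the Tokenizer
        return list(self.vocabulary)
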
Example #4
 def __init__(self, doc_path, true_path, predict_path):
     self.dir_path = os.path.dirname(os.path.realpath(__file__))
     self.doc_path = doc_path
     self.true_path = true_path
     self.predict_path = predict_path
     self.label_list = ['疾病和诊断', '解剖部位', '影像检查', '实验室检验', '药物', '手术', '@']
     self._jieba = jieba.Tokenizer(dictionary=None)
     self._jieba.set_dictionary(
         os.path.join(self.dir_path, 'data/our_dict1.txt'))
     self._jieba.initialize()
     self._jieba_posseg = jieba.posseg.POSTokenizer(tokenizer=self._jieba)
     self.aca: ACA = ACA()
     type_list = ['疾病和诊断', '解剖部位', '影像检查', '实验室检验', '药物', '手术']
     self.term_list = []
     self.term_label_dict = dict()
     for typee in type_list:
         file_i = pd.read_csv(open(os.path.join(
             os.path.dirname(self.dir_path),
             'analysis/res/term_frequency/' + typee +
             '_term_frequency.csv'),
                                   encoding='utf-8-sig'),
                              header=0)
         self.term_list.extend(file_i['term'])
         for term_i in file_i['term']:
             self.term_label_dict[term_i] = typee
     self.aca.add_words(self.term_list)
Example #5
File: entity.py Project: sucre111/cdata
    def __init__(self, entity_list):
        """
            [{"@id":"1","name":"张三"},{"@id":"2","name":"李四"}]
            all input text is assumed to be (or will be converted into) unicode
        """
        # init entity index
        self.entities = collections.defaultdict(list)
        entity_list_unicode = []
        for entity in entity_list:
            entity_list_unicode.append(any2unicode(entity))

        for entity in entity_list_unicode:
            name = entity["name"]
            self.entities[name].append(entity)

        for entity in entity_list_unicode:
            for name in entity.get("alternateName", []):
                self.entities[name].append(entity)

        stat(entity_list_unicode, ["name"])

        # init jieba
        self.tokenizer = jieba.Tokenizer()
        for name in self.entities:
            self.tokenizer.add_word(name)
Example #6
def init_jieba_dict(word_tuple, reload = False):
    if reload :
        remove_jieba_cache()
    jb = jieba.Tokenizer(dictionary=jieba.DEFAULT_DICT)
    for tu in word_tuple:
        jb.add_word(tu[0], freq=tu[1])
    return jb
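
A usage sketch, assuming word_tuple is an iterable of (word, frequency) pairs as the loop implies (the words below are made up; remove_jieba_cache is project-specific):

custom_words = [('深度学习', 2000), ('分词器', 500)]
jb = init_jieba_dict(custom_words)
print(jb.lcut('深度学习分词器效果不错'))
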
Example #7
def get_freq_general(lang='zh'):
    if lang != 'zh':
        print('get_freq_general not implemented for {}'.format(lang))
        return dd(int)
    t = jieba.Tokenizer()
    d = t.gen_pfdict(t.get_dict_file())[0]
    return dd(int, d)
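
A usage sketch, assuming dd is collections.defaultdict as the calls suggest; the returned mapping gives jieba's raw corpus count for known words and 0 otherwise:

freq = get_freq_general('zh')
print(freq['中国'])          # count from jieba's bundled dict.txt
print(freq['从未见过的词'])   # 0 for out-of-vocabulary words
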
Example #8
 def __init__(self,
              pretrained_path,
              raw_data_path,
              output_dir,
              max_len=512,
              is_test=True,
              test_data_path=None,
              split_dic=None,
              voc_type_path=None):
     super().__init__(pretrained_path,
                      raw_data_path,
                      output_dir,
                      max_len=max_len,
                      is_test=is_test,
                      test_data_path=test_data_path)
     self.bert_tokenizer.add_special_tokens(self.special_tokens)
     self.cut_tokenizer = None
     if split_dic is not None:
         self.cut_tokenizer = jieba.Tokenizer()
         self.cut_tokenizer.load_userdict(split_dic)
     self.voc_type = None
     self.disease_list = set()
     if voc_type_path is not None:
         with codecs.open(voc_type_path, encoding='utf-8') as f:
             self.voc_type = json.load(f, encoding='utf-8')
         for v, types in self.voc_type.items():
             if "疾病" in types and len(v) >= 1:
                 self.disease_list.add(v)
Example #9
    def __init__(self, dict_path: str = None):
        import jieba
        self._jieba = jieba.Tokenizer()
        self._jieba.cache_file = "gnes.jieba_wrapper.cache"

        if dict_path is not None:
            self._jieba.load_userdict(dict_path)
Example #10
 def taste_dict(self):
     data = model.Taste.get_all()
     taste_jieba = jieba.Tokenizer()
     for food in data:
         taste_jieba.add_word(food['name'], 2000, food['type'])
     taste_pseg = pseg.POSTokenizer(taste_jieba)
     print('taste_pseg:success init')
     return taste_pseg
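
A hedged usage sketch (model.Taste.get_all() and the tag values are project-specific, and obj stands for an instance of the class above): the returned POSTokenizer yields (word, flag) pairs, and any dish name added above carries its custom food['type'] flag.

taste_pseg = obj.taste_dict()
for word, flag in taste_pseg.cut('这道菜酸甜可口'):
    print(word, flag)
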
Example #11
 def foods_dict(self):
     data = model.Foods.get_all()
     foods_jieba = jieba.Tokenizer()
     for food in data:
         foods_jieba.add_word(food['name'], 2000, food['type'])
     foods_pseg = pseg.POSTokenizer(foods_jieba)
     print('foods_pseg:success init')
     return foods_pseg
Example #12
 def perform_word_segment(cls, corpus):
     """
     process word segmenting use jieba tokenizer
     """
     tokenizer = jieba.Tokenizer()
     corpus['tokens'] = corpus.content.apply(
         lambda x: list(tokenizer.cut(x)))
     return corpus
Example #13
 def material_dict(self):
     data = model.Material.get_all()
     material_jieba = jieba.Tokenizer()
     for food in data:
         material_jieba.add_word(food['name'], 2000, food['parent_code'])
     material_pseg = pseg.POSTokenizer(material_jieba)
     print('material_pseg:success init')
     return material_pseg
Example #14
    def __init__(self, show_reason=False, user_dict=None, stop_words=None):
        '''
        :param show_reason: whether to include the reason for the decision
        :param user_dict: user-defined dictionary; defaults to the bundled one
        :param stop_words: custom stop words
        '''
        self.show_reason = show_reason
        self._user_dict_path = os.path.dirname(
            os.path.abspath(__file__)) + '/Data/'
        self._model_path = os.path.dirname(
            os.path.abspath(__file__)) + '/Data/'
        if self.show_reason:
            self.INIT_REASON = {
                0: "逻辑拼接",
                1: "命中敏感词",
                2: "疑似电话数字",
                3: "数字过长",
                4: "涉及微信号码敏感"
            }
        if user_dict:
            self._user_dict_path = user_dict
        self._jieba_phone_identification = jieba.Tokenizer(
            dictionary=self._user_dict_path + "user_dict.txt")
        self.rule = re.compile("[^\u4e00-\u9fa50-9.]")
        # feature12: mobile numbers start with a relatively fixed set of prefixes
        '''
        China Unicom's current prefixes: 130, 131, 132, 155, 156, 186, 185; the 3G-only prefixes are 186 and 185. The wireless data-card prefix is 145.

        China Mobile's current prefixes: 134, 135, 136, 137, 138, 139, 150, 151, 152, 157, 158, 159, 182, 183, 188, 187.

        China Telecom's current prefixes: 133, 153, 180, 181, 189.
        '''
        # prefixes starting with 14 are mostly data cards; 99.99% of people never use them for calls
        self._phone_start_position_number = ('13', '15', '17', '18')
        if stop_words:
            self.stop_words = stop_words
        else:
            # filler words that carry little meaning
            self.stop_words = [
                '你', '我', '的', '啊', '嗯', '是', '吧', '对', '了', '那个', '那', '就',
                '好', '到', '给', '噢', '这个', '他', '说', '在', '不', '什么', '唉', '要',
                '也', '吗', '都', '现在', '一下', '这', '有', '就是', '不是', '呢', '好好',
                '能', '装', '看', '喂', '嘛', '知道', '你好', '可以', '没有', '多少', '多',
                '那边', '去', '没', '怎么', '常州', '哪里', '跟', '呀', '把', '我们', '的话',
                '货', '地方', '明天', '还', '行', '车', '不能', '问', '走', '等', '来', '给我',
                '这边', '再', '这样', '过去', '今天', '然后', '不知道', '上', '因为', '是不是',
                '得', '不了', '叫', '哦', '不要', '无锡', '上面', '反正', '南京', '讲', '搞',
                '还是', '过来', '看看', '拉', '应该', '东西', '它', '进去', '托盘', '车子', '还有',
                '可能', '又', '从', '哪', '时候', '拿', '啦', '肯定', '大概', '你们', '差不多',
                '写', '跑', '不行', '不到', '位置'
            ]
        self._tfidf_model = _readbunchobj(self._model_path +
                                          'train_data_tfidf_model.tfidf')
        self._model_0 = _readbunchobj(self._model_path +
                                      'train_data_mnb_tri_0.nb')
        self._quantile = _readbunchobj(self._model_path + 'quantile.dat')
        self._last_model = _readbunchobj(self._model_path + 'last_model.gbm')
        self._km_model = _readbunchobj(self._model_path + 'kmeans.m')
Example #15
 def __init__(self):
     self.token = jieba.Tokenizer()
     file = [
         x.path for x in os.scandir(config.JIEBA_DICT_PATH)
         if x.path.endswith("txt")
     ]
     for fp in file:
         self.token.load_userdict(fp)
     self.pos_token = POSTokenizer(self.token)
Example #16
 def __init__(self, tokenizer=None):
     #dict_path = os.path.dirname(os.path.dirname(os.path.split(os.path.realpath(__file__))[0]))+"/people.dict"
     #print dict_path
     #jieba.re_han_default = re.compile("", re.U)
     #print 'loading...'
     #jieba.load_userdict(dict_path)
     #print 'dict load successfully'
     self.tokenizer = jieba.Tokenizer()
     self.load_word_tag(self.tokenizer.get_abs_path_dict())
Example #17
    def __init__(self,
                 model_name,
                 tokenize=None,
                 pbmodel_dir=None,
                 use_hvd=False):
        # keeps the sess, graph, config and saver
        self.model_name = model_name
        if tokenize is None:
            self.jieba = jieba.Tokenizer()
            # self.jieba.load_userdict(f'{curr_dir}/../data/segword.dct')
            self.tokenize = lambda t: self.jieba.lcut(re.sub(r'\s+', ',', t))
        else:
            self.tokenize = tokenize
        self.cut = lambda t: ' '.join(self.tokenize(t))
        self.token2id_dct = {
            # 'word2id': utils.Any2Id.from_file(f'{curr_dir}/../data/mmch_word2id.dct', use_line_no=True),  # in-house data
            # 'word2id': utils.Any2Id.from_file(f'{curr_dir}/../data/mmch_char2id.dct', use_line_no=True),  # in-house data
            'word2id':
            utils.Any2Id.from_file(f'{curr_dir}/../data/DB_mmch_word2id.dct',
                                   use_line_no=True),  # Douban multi-turn corpus
            'char2id':
            utils.Any2Id.from_file(f'{curr_dir}/../data/DB_mmch_char2id.dct',
                                   use_line_no=True),  # Douban multi-turn corpus
        }
        self.config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True),
        )
        self.use_hvd = use_hvd if HVD_ENABLE else False
        if self.use_hvd:
            hvd.init()
            self.hvd_rank = hvd.rank()
            self.hvd_size = hvd.size()
            self.config.gpu_options.visible_device_list = str(hvd.local_rank())
        self.graph = tf.Graph()
        self.sess = tf.Session(config=self.config, graph=self.graph)

        if pbmodel_dir is not None:  # predict only
            self.model = MMCH_Model.from_pbmodel(pbmodel_dir, self.sess)
        else:
            with self.graph.as_default():
                self.model = MMCH_Model(model_name=self.model_name,
                                        run_model=self)
                if self.use_hvd:
                    self.model.optimizer._lr = self.model.optimizer._lr * self.hvd_size  # scale the learning rate with the larger effective batch in distributed training
                    self.model.hvd_optimizer = hvd.DistributedOptimizer(
                        self.model.optimizer)
                    self.model.train_op = self.model.hvd_optimizer.minimize(
                        self.model.loss, global_step=self.model.global_step)
                self.sess.run(tf.global_variables_initializer())
                if self.use_hvd:
                    self.sess.run(hvd.broadcast_global_variables(0))

        with self.graph.as_default():
            self.saver = tf.train.Saver(
                max_to_keep=100)  # must in the graph context
Example #18
File: tokenizer.py Project: zwqll/class
    def __init__(self,
                 user_dict_path='',
                 entity_dict_path='',
                 stop_words_path='',
                 user_dict=(),
                 entity_dict=(),
                 stop_words=(),
                 use_single_char=False):
        """
        初始化分词器,用词典初始化
        :param user_dict_path: 用户词典路径
        :param entity_dict_path: 实体词典路径
        :param stop_words_path: 停用词路径
        :param user_dict: 用户词典集合
        :param entity_dict: 实体词典集合
        :param stop_words: 停用词集合
        """
        assert isinstance(user_dict_path, str)
        assert isinstance(entity_dict_path, str)
        assert isinstance(stop_words_path, str)
        assert isinstance(user_dict, tuple)
        assert isinstance(entity_dict, tuple)
        assert isinstance(stop_words, tuple)

        self.use_single_char = use_single_char
        # initialize the jieba tokenizer
        self.tokenizer = jieba.Tokenizer()
        try:
            if os.path.exists(user_dict_path):
                self.tokenizer.load_userdict(user_dict_path)
            if os.path.exists(entity_dict_path):
                self.tokenizer.load_userdict(entity_dict_path)
            for word in user_dict:
                self.tokenizer.add_word(word)
            for word in entity_dict:
                self.tokenizer.add_word(word)
        except Exception as e:
            print(e)
        self.pos_tokenizer = jieba.posseg.POSTokenizer(
            tokenizer=self.tokenizer)

        # initialize the stop-word list
        self.stop_words = []
        try:
            if os.path.exists(stop_words_path):
                with open(stop_words_path, 'r', encoding='utf-8') as f:
                    for line in f.readlines():
                        word = line.replace('\r',
                                            '').replace('\n', '').replace(
                                                '\t', '').replace(' ', '')
                        self.stop_words.append(word)

            self.stop_words.extend(list(stop_words))
            self.stop_words = list(set(self.stop_words))
        except Exception as e:
            print(e)
Example #19
 def __init__(self, model_path, userdict_path, stopword_path):
     self.clf = None
     self.vectorizer = None
     self.tfidftransformer = None
     self.model_path = model_path
     self.stopword_path = stopword_path
     self.userdict_path = userdict_path
     self.stop_words = []
     self.tokenizer = jieba.Tokenizer()
     self.initialize()
Example #20
 def technics_dict(self):
     data = model.Technics.get_all()
     technics_jieba = jieba.Tokenizer()
     for food in data:
         technics_jieba.del_word(food['name'])
         # technics_jieba.add_word('是',2000,'ttt')
         technics_jieba.add_word(food['name'], 2000, food['type'])
     technics_pseg = pseg.POSTokenizer(technics_jieba)
     print('technics_pseg:success init')
     return technics_pseg
Example #21
 def __init__(self, modelPath, stopwordPath, userDictPath):
     self.clt = None
     self.vectorizer = None
     self.tfidftransformer = None
     self.modelPath = modelPath
     self.stopwordPath = stopwordPath
     self.userDictPath = userDictPath
     self.stopWords = []
     self.tokenizer = jieba.Tokenizer()
     self.initalize()
Example #22
 def __init__(self, vocab):
     super(JiebaTokenizer, self).__init__(vocab)
     self.tokenizer = jieba.Tokenizer()
     # initialize tokenizer
     self.tokenizer.FREQ = {
         key: 1
         for key in self.vocab.token_to_idx.keys()
     }
     self.tokenizer.total = len(self.tokenizer.FREQ)
     self.tokenizer.initialized = True
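
The trick here is to skip jieba's dictionary load entirely: FREQ is filled from the vocabulary, total and initialized are set by hand, and segmentation is then restricted to in-vocab words. A standalone sketch of the same idea (the vocabulary below is made up); note that jieba's prefix dictionary also expects every prefix of a multi-character word to be present, which gen_pfdict() normally guarantees with zero counts:

import jieba

vocab_words = ['自然语言', '处理']                 # hypothetical vocabulary
tok = jieba.Tokenizer()
tok.FREQ = {w: 1 for w in vocab_words}
for w in vocab_words:                             # add prefixes so get_DAG() can extend matches
    for i in range(1, len(w)):
        tok.FREQ.setdefault(w[:i], 0)
tok.total = len(vocab_words)
tok.initialized = True                            # do not load the bundled dict.txt
print(tok.lcut('自然语言处理', HMM=False))          # ['自然语言', '处理']
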
Example #23
 def __init__(self, doc_path, true_path, predict_path):
     self.dir_path = os.path.dirname(os.path.realpath(__file__))
     self.doc_path = doc_path
     self.label_list = ['疾病和诊断', '解剖部位', '影像检查', '实验室检验', '药物', '手术', ' ']
     self._jieba = jieba.Tokenizer(dictionary=None)
     self._jieba.set_dictionary(
         os.path.join(self.dir_path, 'data/our_dict1.txt'))
     self._jieba.initialize()
     self._jieba_posseg = jieba.posseg.POSTokenizer(tokenizer=self._jieba)
     self.true_path = true_path
     self.predict_path = predict_path
Example #24
 def __init__(self, ei_file):
     self.ei_file = ei_file
     self.df_file = 'tf_counter.json'
     if os.path.exists(self.df_file):
         print('df_file exists, loading the df_file ...')
         self.df = json.load(open(self.df_file))
     else:
         self.df = Counter()
         self._read_ei_file()
     self.jieba_tokenizer = jieba.Tokenizer()
     self.jieba_tokenizer.tmp_dir = '.'
Example #25
    def _load_user_dict(self, user_dict_path):
        tokenizer = jieba.Tokenizer()
        if user_dict_path is None:
            return tokenizer

        if os.path.isfile(user_dict_path):
            tokenizer.load_userdict(user_dict_path)
        elif os.path.isdir(user_dict_path):
            for fn in os.listdir(user_dict_path):
                fp = os.path.join(user_dict_path, fn)
                if os.path.isfile(fp):
                    tokenizer.load_userdict(fp)
        return tokenizer
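
A usage sketch (the paths are hypothetical and obj stands for an instance of the class above): the helper accepts a single user-dictionary file, a directory of such files, or None for a plain tokenizer.

tokenizer = obj._load_user_dict('dicts/')           # every file in the directory is loaded
# tokenizer = obj._load_user_dict('userdict.txt')   # or a single file
# tokenizer = obj._load_user_dict(None)             # plain jieba.Tokenizer()
print(tokenizer.lcut('加载自定义词典'))
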
Example #26
    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = jieba.Tokenizer()
        self.__jieba.cache_file = self.__CACHE_PATH

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException(
                "Jieba dictionary directory was not found: %s" %
                self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s"
                % self.__DICT_PATH)
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" %
                self.__DICT_PATH)
        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(
                self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            raise McLanguageException("Unable to initialize Jieba: %s" %
                                      str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)
Example #27
    def __init__(self, title):
        super(DefaultModelServer, self).__init__(title)

        # # load the sensitive-word lexicon
        # with open('ai/{}/fenlei_mingan'.format(title), 'rb') as f:
        #     self.mingan_dict = pickle.load(f)

        # load the supplementary jieba user dictionary

        #jieba.load_userdict("ai/{}/jieba_buchong.txt".format(title))
        self.jieba_fnlp = jieba.Tokenizer()

        # load the stop-word list
        self.stopwords = pd.read_csv("ai/{}/stopwords.txt".format(title),
                                     index_col=False,
                                     quoting=3,
                                     sep="\t",
                                     names=['stopword'],
                                     encoding='utf-8').values

        # load the fasttext model
        self.ft_model = fasttext.load_model(
            'ai/{}/classifier.model.bin'.format(title),
            label_prefix='__label__')
        #self.ft_model = fasttext.load_model('ai/{}/classifier.model.bin'.format(title))
        #==============================
        #print('ai/{}/classifier.model.bin'.format(title))
        cp = configparser.ConfigParser()
        cp.read('./ai/{}/labels.ini'.format(title), encoding='utf-8')
        kvs = cp.items("labels")
        kvs_cn = cp.items('labels_cn')

        #self.label_to_cate = {3: 'violation_politics', 2: 'normal_politics', 1: 'normal'}
        self.kind_book = []
        self.kind_book_cn = []

        for kv in kvs:
            self.kind_book.append(kv[1])

        for kv in kvs_cn:
            self.kind_book_cn.append(kv[1])

        self.ok = True
        self.title = title
Example #28
    def __init__(self, normalizer='basic_with_num', term_file=None):
        '''
        Initialize the tokenizer and the string normalizer, and load the synonym dictionary from term_file.
        Args:
            term_file: custom dictionary
        Returns:
            None
        '''
        # load the string normalizer and the synonym dictionary
        self.normalizer = normalizer
        self.tokenizer = jieba.Tokenizer()
        self.synonym = {}
        if term_file is not None:
            df = pd.read_csv(term_file)
            for i, row in df.iterrows():
                word = unicode(str(row['word']), 'utf8')
                if self.normalizer == 'basic':
                    word = normalize(word)
                elif self.normalizer == 'basic_with_num':
                    word = normalize_with_num(word)
                else:
                    pass
                if len(word) == 0:
                    continue
                self.tokenizer.add_word(word)

                # map synonyms to their canonical word
                if row['synonym'] is not None:
                    synonym = unicode(str(row['synonym']), 'utf8')

                    if self.normalizer == 'basic':
                        synonym = normalize(synonym)
                    elif self.normalizer == 'basic_with_num':
                        synonym = normalize_with_num(synonym)
                    else:
                        pass

                    if len(synonym) == 0:
                        continue
                    self.tokenizer.add_word(synonym)

                    if word != synonym:
                        self.synonym[synonym] = word
        LOGGER.debug('init JiebaProcessor success')
Example #29
    def manage_word(self,todo,word,kindnum):
        ret = '1'

        if todo=='increase':
            self.increase(word,kindnum)
            #return '已将"'+word+'"增加为“'+self.kind_book[int(kindnum)]+'”类别'
            return ret

        elif todo =='change':
            if word in self.mingan_dict:
                self.change(word,kindnum)
                #return '已将"'+word+'"更改为“'+self.kind_book[int(kindnum)]+'”类别'
                return ret
            else:
                return '-1'

        elif todo =='delete':
            if word in self.mingan_dict:
                self.delete(word)
                return ret

            else:
                return '-1'

        elif todo == 'search':
            if word in self.mingan_dict:
                return self.search(word)
            else:
                return '-1'

        elif todo == 'notice':
            if word == '1':

                with open('/data/ai_g7/sensitive_words', 'rb') as f:
                    self.mingan_dict = pickle.load(f)

                self.jieba_kw = jieba.Tokenizer(dictionary="/data/ai_g7/jieba_kwdict.txt")
                print ('update successful')
                return '1'
            else:
                return '-1'
        else:

            return '-2'
Example #30
    def perform_word_segment(cls, corpus):
        """
        process word segmenting use jieba tokenizer
        """

        jieba.suggest_freq('小艾', True)
        jieba.suggest_freq('艾佳', True)
        jieba.suggest_freq('艾佳家居', True)
        jieba.suggest_freq('米兰星光', True)
        jieba.suggest_freq('诗意新居', True)
        jieba.suggest_freq('雅君壹格', True)
        jieba.suggest_freq('以爱之名', True)
        jieba.suggest_freq('艾师傅', True)
        jieba.suggest_freq('地暖', True)

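        # Note: jieba.suggest_freq() above adjusts the module-level default
        # tokenizer; the fresh Tokenizer() below keeps its own frequency dict.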
        tokenizer = jieba.Tokenizer()
        corpus['tokens'] = corpus.content.astype('str').apply(
            lambda x: list(tokenizer.cut(x)))
        return corpus