def taste_dict(self):
    data = model.Taste.get_all()
    taste_jieba = jieba.Tokenizer()
    for food in data:
        taste_jieba.add_word(food['name'], 2000, food['type'])
    taste_pseg = pseg.POSTokenizer(taste_jieba)
    print('taste_pseg:success init')
    return taste_pseg
def foods_dict(self):
    data = model.Foods.get_all()
    foods_jieba = jieba.Tokenizer()
    for food in data:
        foods_jieba.add_word(food['name'], 2000, food['type'])
    foods_pseg = pseg.POSTokenizer(foods_jieba)
    print('foods_pseg:success init')
    return foods_pseg
def material_dict(self):
    data = model.Material.get_all()
    material_jieba = jieba.Tokenizer()
    for food in data:
        material_jieba.add_word(food['name'], 2000, food['parent_code'])
    material_pseg = pseg.POSTokenizer(material_jieba)
    print('material_pseg:success init')
    return material_pseg
def technics_dict(self):
    data = model.Technics.get_all()
    technics_jieba = jieba.Tokenizer()
    for food in data:
        # Drop any existing entry before re-adding it with a custom tag.
        technics_jieba.del_word(food['name'])
        # technics_jieba.add_word('是', 2000, 'ttt')
        technics_jieba.add_word(food['name'], 2000, food['type'])
    technics_pseg = pseg.POSTokenizer(technics_jieba)
    print('technics_pseg:success init')
    return technics_pseg
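# A minimal, self-contained sketch of how one of the POSTokenizers built above
# might be used. The hand-added words, tags, and sample sentence are
# illustrative assumptions standing in for the model.*.get_all() data; they are
# not taken from the source.
import jieba
import jieba.posseg as pseg

tokenizer = jieba.Tokenizer()
tokenizer.add_word('麻辣', 2000, 'taste')
tokenizer.add_word('清淡', 2000, 'taste')
taste_pseg = pseg.POSTokenizer(tokenizer)

for word, flag in taste_pseg.cut('这道菜麻辣但是不油腻'):
    print(word, flag)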
def parse_test(p_str):
    '''
    Call the jieba module to parse the text with tags, and transform the
    respective tag to NLPI forms.
    input: text string
    output: results with tags
    '''
    # Note: this POSTokenizer instance is created but never used; pseg.cut()
    # below falls back to the default tokenizer.
    pseg.POSTokenizer(tokenizer=None)
    words = pseg.cut(p_str)
    ret_str = ''
    for word, flag in words:
        ret_str += word + '/' + flag + ' '
    return ret_str.encode('GB18030')
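# A hypothetical call to parse_test() above; the sample sentence is an
# illustrative assumption. The function returns GB18030-encoded bytes,
# so decode before printing.
tagged = parse_test('我们正在学习自然语言处理')
print(tagged.decode('GB18030'))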
def __init__(self, model='jieba'):
    self.model = model
    if model.lower() == 'jieba':
        import jieba.posseg as posseg
        posseg.initialize()
        self.segmentor = posseg.POSTokenizer(tokenizer=None)
    elif model.lower() == 'ictclas':
        import pynlpir
        pynlpir.open()
        self.segmentor = pynlpir
    else:
        raise NotImplementedError
import jieba.posseg as pseg

words = pseg.cut("中国人民是不可战胜的")  # POS tagging
# Note: words1 is a POSTokenizer instance, not a tagging result; printing it
# only shows the object representation.
words1 = pseg.POSTokenizer(tokenizer=pseg.dt)
for word, flag in words:
    print('%s %s' % (word, flag))
print(words1)
# -*- coding: UTF-8 -*-
import sys
import jieba.posseg as pseg
import jieba
import argparse

t = pseg.POSTokenizer()
jieba.initialize()


def segment_nodict(string):
    """
    Segment and POS-tag the input sentence.
    :param string: the string that needs to be segmented
    :return: the segmentation result as a list of tuples; the first element of
             each tuple is the word and the second is its POS tag.
    """
    segment_words = t.cut(string)
    segment_list = []
    for i in segment_words:
        segment_list.append((i.word, i.flag))
    return segment_list


if __name__ == '__main__':
    with open('jiebapos_union3.txt', 'r') as f:
        lines = f.readlines()
    str = ''
    for index, item in enumerate(lines):
        if item != '\n':
            word = item.split('\t')[0].strip()
        else:
import logging
import re

import jieba
from jieba import posseg

from .cmd_tools import tika_convert, antiword_convert
from . import _config
from .data_helps import corpus_cut

__all__ = (
    "Files2Text",
    "CorpusClean",
    "CorpusHandler"
)

jieba.setLogLevel(logging.INFO)
jieba.enable_parallel()
jieba_inst = posseg.POSTokenizer(jieba.Tokenizer())


class ReCache:
    N_RE = re.compile(r'\n')
    MORE_SPACE_T_RE = re.compile(r'[\s\t]+')
    CN_RE = re.compile(r'^[\u4e00-\u9fa5]+$')


class BaseCorpus(object):
    def __init__(self, logger=None):
        if logger is None:
            logging.basicConfig(level=logging.DEBUG,
                                format=_config.LOG_FORMAT,
                                datefmt=_config.LOG_DATE_FORMAT)
def __init__(self, dicts=list()):
    self.jieba = jieba
    # Customize the dictionary: specific words, user dictionaries, numerals.
    self.jieba.add_word("成龙", 1000, 'movie_person')
    self.jieba.add_word("快进", 1000)
    for k in ['#UNK', '#PAD', '#BOS', '#EOS']:
        self.jieba.add_word(k, 1000)
    self.jieba.add_word("快进", 1000)
    self.jieba.add_word("快进", 1000)
    self.jieba.add_word("快进", 1000)
    self.jieba.add_word("快进", 1000)
    self.jieba.add_word("人民的名义")
    self.jieba.add_word("上一页")
    self.jieba.add_word("下一页")
    self.num_dic = NumDict()
    self.num_dic, self.num_array = self.num_dic.get_num_data()
    for d in dicts:
        self.jieba.load_userdict(d)
    logging.debug('Jieba load user dicts: {}'.format(dicts))

    data = list()
    # Numerals
    data.append(['上一集', '下一集'])
    data.append(self.num_array)
    for f in data:
        for l in f:
            jieba.add_word(l)

    # Load user-defined dictionary
    db_conf = {
        'host': '192.168.11.122',
        'port': 3306,
        'password': '******',
        'user': '******'
    }
    self.db = pymysql.connect(**db_conf, charset='utf8mb4',
                              cursorclass=pymysql.cursors.DictCursor)
    # self.db = conf.get_mysql('ailab')
    self.seg = jieba.Tokenizer()
    self.load_userdict()

    # Customize the dictionary: specific words, user dictionaries, numerals.
    self.seg.add_word("成龙", 1000, 'movie_person')
    self.seg.add_word("快进", 1000)
    for k in ['#UNK', '#PAD', '#BOS', '#EOS']:
        self.seg.add_word(k, 1000)
    self.seg.add_word("快进", 1000)
    self.seg.add_word("快进", 1000)
    self.seg.add_word("快进", 1000)
    self.seg.add_word("快进", 1000)
    self.seg.add_word("人民的名义")
    self.seg.add_word("上一页")
    self.seg.add_word("下一页")

    data = list()
    # Numerals
    data.append(['上一集', '下一集'])
    data.append(self.num_array)
    for f in data:
        for l in f:
            self.seg.add_word(l)

    self.pos = posseg.POSTokenizer(self.seg)
def __init__(self, threading_pool):
    self._save_dir = os.path.join(config.LEARN_PATH, "{user_id}")
    self.token = jieba.Tokenizer()
    self.pos_token = posseg.POSTokenizer(self.token)
    self.threading_pool = threading_pool
    self.queue = Queue()
def __init__(self):
    self.zt = ["不能", "无法", "没法", "不足", "不了", "不够", "不多", "欠费"]  # 21
    self.zt1 = ["欠费了", "不够了", "不足了", "停机了"]
    self.noaccount_needadd1 = ["充值了", "充了", "充钱了", "冲了", "交了", "充过"]  # 19
    self.noaccount_needadd2 = ["不能", "无法", "没法", "不足", "不了"]  # 20
    self.noaccount_keywords_1 = [
        "充", "充值了", "充了", "冲", "冲了", "交了", "充值", "缴费", "交费",
        "充话费", "交", "充过"
    ]  # 8
    self.noaccount_keywords = [
        "没显示", "没有显示", "没反应", "没有反应", "没有收到", "没收到", "没增加",
        "没有增加", "没提示", "没有提示", "停机", "打不出去", "还不通", "还欠费",
        "没到", "没有到", "欠费", "欠", "停机", "不能", "无法", "没法", "没变", "没钱"
    ]
    self.noaccount_keywords_2 = [
        "没到账", "没有到账", "没给充上", "没有充上", "不到账", "未到账", "不进账"
    ]  # 10
    self.nook_keyword2 = ["有", "还有"]
    self.nook_keyword3 = ["钱", "余额", "元"]
    self.nook_keyword = [
        "只到了", "只到账", "不对", "只有", "只剩", "只", "少了", "不准确", "就剩",
        "怎么剩", "就还剩", "应该还剩", "不一致", "不一样", "对不上", "打不出去",
        "打不通", "不能", "无法", "没法", "有问题", "有点问题"
    ]  # 9
    self.nook_keyword1 = ["话费", "余额", "查话费"]  # 18
    self.noaccount_exceptwords = [
        "兑换", "流量", "积分", "宽带", "星级", "游戏", "电子券", "活动", "发票",
        "充错", "会员", "打印发票", "送"
    ]
    self.noaccount_keywords_3 = [
        "微信公众号", "公众号", "app", "客户端", "掌厅", "掌上营业厅"
    ]  # 17
    self.huafei_1 = ["充", "交", "冲", "冲话费", "充流量", "充话费", "充值", "充钱", "缴费"]  # 1
    self.huafei_2 = [
        "方法", "哪里", "怎么", "方式", "如何", "渠道", "我想", "我要", "想", "要",
        "咋", "怎样"
    ]  # 2
    self.huafei_3 = [
        "发票", "开发票", "q", "qb", "qq币", "qq", "打印发票", "记录", "电费",
        "座机", "固定电话", "会员", "扣币"
    ]  # 3
    self.huafei_4 = [
        "刚", "刚刚", "积分", "宽带", "星级", "游戏", "电子券", "活动", "兑换",
        "固话", "开机", "漫游", "语音", "扣", "账单", "明细", "详单"
    ]  # 16
    self.huafei_5 = ["知道", "了解"]
    self.yue_1 = ["查", "查询"]  # 11
    self.yue_2 = ["话费", "钱", "余额"]  # 12
    self.yue_3 = ["剩", "剩余", "有多少", "还有多少"]  # 13
    self.yue_4 = ["查话费", "余额"]  # 14

    self.tkz = jieba.Tokenizer()
    from answers.word_dictionary import Dictionary
    for i in Dictionary.meword:
        elem = i.split(' ')
        if len(elem) == 1:
            self.tkz.add_word(i)
        elif len(elem) == 2:
            self.tkz.add_word(elem[0], tag=elem[1])
        else:
            self.tkz.add_word(elem[0], tag=elem[2], freq=int(elem[1]))
    # self.tkz.load_userdict("meword")
    self.psegp = pseg.POSTokenizer(self.tkz)
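# Hypothetical examples of the three `Dictionary.meword` entry formats that the
# parsing loop above accepts; the concrete words, frequency and tag values are
# illustrative assumptions, not data from the source.
meword_examples = [
    '欠费',             # word only
    '不到账 kw',        # word followed by a POS tag
    '充话费 2000 kw',   # word, frequency, and POS tag
]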
import jieba.posseg as pseg
import jieba

data = open("data.txt", "r")
result = open("result.txt", "w")
pos = pseg.POSTokenizer(jieba.Tokenizer(dictionary='dict.txt.big'))

for line in data:
    words = pos.cut(line)
    for word, flag in words:
        result.write('%s\\%s ' % (word, flag))
    # Only the first line is processed.
    break

data.close()
result.close()

words = pos.cut('我的老师说:“我们大家都是好孩子,好孩子应该看《好书》”')
for word, flag in words:
    print('%s %s' % (word, flag))
def __init__(self, user_dict: Union[str, Iterable] = None):
    self.t = posseg.POSTokenizer()
    self.t.initialize()
    self.trie = Trie()
    if user_dict:
        self.load_user_dict(user_dict)
import glob
import math
import json

import jieba
from jieba import posseg

token = jieba.Tokenizer()
file = glob.glob("./../jieba_dict/*.txt")
for fp in file:
    token.load_userdict(fp)
pos_token = posseg.POSTokenizer(token)

file = glob.glob("./*.txt")
item_ids = 0
sentence_ids = 0  # number of sentences processed
word_bag = set()
PMI_DICT = {}  # key: (wj, wi); value: log(P(wi|wj) / P(wi))
WI = {}

for fp in file:
    for line in open(fp, "r", encoding="utf-8"):
        line_list = line.strip().split("####")
        if len(line_list) != 3:
            continue
        query, response, sentiment = line_list
        sentence_ids += 1
        query_token = [word for word, tag in pos_token.lcut(query) if "n" in tag]
        for to in query_token:
            word_bag.add(to)
        response_token = [word for word, tag in pos_token.lcut(response) if "n" in tag]
        for to in response_token:
            WI[to] = WI.get(to, 0) + 1
            word_bag.add(to)