Example No. 1
 def taste_dict(self):
     data = model.Taste.get_all()
     taste_jieba = jieba.Tokenizer()
     for food in data:
         taste_jieba.add_word(food['name'], 2000, food['type'])
     taste_pseg = pseg.POSTokenizer(taste_jieba)
     print('taste_pseg:success init')
     return taste_pseg
Example No. 2
 def foods_dict(self):
     data = model.Foods.get_all()
     foods_jieba = jieba.Tokenizer()
     for food in data:
         foods_jieba.add_word(food['name'], 2000, food['type'])
     foods_pseg = pseg.POSTokenizer(foods_jieba)
     print('foods_pseg:success init')
     return foods_pseg
Example No. 3
 def material_dict(self):
     data = model.Material.get_all()
     material_jieba = jieba.Tokenizer()
     for food in data:
         material_jieba.add_word(food['name'], 2000, food['parent_code'])
     material_pseg = pseg.POSTokenizer(material_jieba)
     print('material_pseg:success init')
     return material_pseg
Example No. 4
 def technics_dict(self):
     data = model.Technics.get_all()
     technics_jieba = jieba.Tokenizer()
     for food in data:
         technics_jieba.del_word(food['name'])
         # technics_jieba.add_word('是',2000,'ttt')
         technics_jieba.add_word(food['name'], 2000, food['type'])
     technics_pseg = pseg.POSTokenizer(technics_jieba)
     print('technics_pseg:success init')
     return technics_pseg
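Examples 1-4 all follow the same pattern: create a dedicated jieba.Tokenizer, register domain words with a fixed frequency and a custom POS tag via add_word, then wrap the tokenizer in pseg.POSTokenizer. Below is a minimal, self-contained sketch of that pattern and of how the resulting tagger is typically consumed; the word list and the 'dish' tag are illustrative and are not taken from the model tables used above.

import jieba
import jieba.posseg as pseg

# A separate Tokenizer keeps the custom vocabulary out of the global jieba dictionary.
food_jieba = jieba.Tokenizer()
for name in ['宫保鸡丁', '鱼香肉丝']:          # illustrative domain words
    food_jieba.add_word(name, 2000, 'dish')   # freq=2000, custom POS tag 'dish'

food_pseg = pseg.POSTokenizer(food_jieba)

# cut() yields pair objects that unpack into (word, flag); words registered
# above are expected to come back with the user-defined tag.
for word, flag in food_pseg.cut('今天想吃宫保鸡丁'):
    print(word, flag)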
Example No. 5
import jieba.posseg as pseg


def parse_test(p_str):
    '''Call the jieba module to parse the text with POS tags, and transform
    the respective tags to NLPI forms.
    input: text string
    output: results with tags
    '''
    tagger = pseg.POSTokenizer(tokenizer=None)  # tokenizer=None falls back to the default jieba tokenizer
    words = tagger.cut(p_str)
    ret_str = ''
    for word, flag in words:
        ret_str += word + '/' + flag + ' '
    return ret_str.encode('GB18030')
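A short usage sketch for parse_test, assuming it lives in a module where jieba.posseg is imported as pseg (as above); the sample sentence is illustrative. The function returns GB18030-encoded bytes, so the result is decoded before printing.

tagged = parse_test('我们大家都是好孩子')
print(tagged.decode('GB18030'))  # prints space-separated "word/flag" pairs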
Example No. 6
 def __init__(self, model='jieba'):
     self.model = model
     if model.lower() == 'jieba':
         import jieba.posseg as posseg
         posseg.initialize()
         self.segmentor = posseg.POSTokenizer(tokenizer=None)
     elif model.lower() == 'ictclas':
         import pynlpir
         pynlpir.open()
         self.segmentor = pynlpir
     else:
         raise NotImplementedError
Example No. 7
import jieba
import jieba.posseg as pseg

words = pseg.cut("中国人民是不可战胜的")  # POS tagging
words1 = pseg.POSTokenizer(tokenizer=jieba.dt)  # wrap the default jieba Tokenizer, not pseg.dt
for word, flag in words:
    print('%s %s' % (word, flag))
print(words1)
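The snippet above only prints the POSTokenizer object itself. To tag text with it, call its cut method, which yields the same (word, flag) pairs as the module-level pseg.cut; the sample sentence is reused from above.

for word, flag in words1.cut("中国人民是不可战胜的"):
    print('%s %s' % (word, flag))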
Example No. 8
# -*- coding: UTF-8 -*-
import sys
import jieba.posseg as pseg
import jieba
import argparse

t = pseg.POSTokenizer()
jieba.initialize()


def segment_nodict(string):
    """
    输入为要分词的句子,输出分词后和标注后的结果
    :param string: the string that need to be segment
    :return: 分词后的结果,用一个列表表示,列表的每一个元素是一个元组,元组第一维表示分好后的词,第二维表示词性。
    """
    segment_words = t.cut(string)
    segment_list = []
    for i in segment_words:
        segment_list.append((i.word, i.flag))
    return segment_list


if __name__ == '__main__':
    with open('jiebapos_union3.txt', 'r') as f:
        lines = f.readlines()
    str = ''
    for index, item in enumerate(lines):
        if item != '\n':
            word = item.split('\t')[0].strip()
        else:
Example No. 9
import logging
import re

import jieba
from jieba import posseg

from .cmd_tools import tika_convert, antiword_convert
from . import _config
from .data_helps import corpus_cut

__all__ = (
    "Files2Text",
    "CorpusClean",
    "CorpusHandler"
)

jieba.setLogLevel(logging.INFO)
jieba.enable_parallel()
jieba_inst = posseg.POSTokenizer(jieba.Tokenizer())


class ReCache:
    N_RE = re.compile(r'\n')
    MORE_SPACE_T_RE = re.compile(r'[\s\t]+')
    CN_RE = re.compile(r'^[\u4e00-\u9fa5]+$')


class BaseCorpus(object):

    def __init__(self, logger=None):
        if logger is None:
            logging.basicConfig(level=logging.DEBUG,
                                format=_config.LOG_FORMAT,
                                datefmt=_config.LOG_DATE_FORMAT)
Example No. 10
    def __init__(self, dicts=list()):
        self.jieba = jieba

        # Custom dictionary additions: specific words, the user dictionary, and the number dictionary
        self.jieba.add_word("成龙", 1000, 'movie_person')
        self.jieba.add_word("快进", 1000)
        for k in ['#UNK', '#PAD', '#BOS', '#EOS']:
            self.jieba.add_word(k, 1000)
        self.jieba.add_word("快进", 1000)
        self.jieba.add_word("快进", 1000)
        self.jieba.add_word("快进", 1000)
        self.jieba.add_word("快进", 1000)
        self.jieba.add_word("人民的名义")
        self.jieba.add_word("上一页")
        self.jieba.add_word("下一页")

        self.num_dic = NumDict()

        self.num_dic, self.num_array = self.num_dic.get_num_data()

        for d in dicts:
            self.jieba.load_userdict(d)

        logging.debug('Jieba load user dicts: {}'.format(dicts))

        data = list()
        # numbers
        data.append(['上一集', '下一集'])
        data.append(self.num_array)

        for f in data:
            for l in f:
                jieba.add_word(l)

        # load the user-defined dictionary
        db_conf = {
            'host': '192.168.11.122',
            'port': 3306,
            'password': '******',
            'user': '******'
        }
        self.db = pymysql.connect(**db_conf,
                                  charset='utf8mb4',
                                  cursorclass=pymysql.cursors.DictCursor)
        # self.db = conf.get_mysql('ailab')
        self.seg = jieba.Tokenizer()
        self.load_userdict()
        # Custom dictionary additions: specific words, the user dictionary, and the number dictionary
        self.seg.add_word("成龙", 1000, 'movie_person')
        self.seg.add_word("快进", 1000)
        for k in ['#UNK', '#PAD', '#BOS', '#EOS']:
            self.seg.add_word(k, 1000)
        self.seg.add_word("快进", 1000)
        self.seg.add_word("快进", 1000)
        self.seg.add_word("快进", 1000)
        self.seg.add_word("快进", 1000)
        self.seg.add_word("人民的名义")
        self.seg.add_word("上一页")
        self.seg.add_word("下一页")
        data = list()
        # numbers
        data.append(['上一集', '下一集'])
        data.append(self.num_array)

        for f in data:
            for l in f:
                self.seg.add_word(l)

        self.pos = posseg.POSTokenizer(self.seg)
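A hedged usage sketch for the tagger built above; nlu stands for a hypothetical instance of this class and is not part of the original code. Because "成龙" was registered with the 'movie_person' tag, the wrapped POSTokenizer is expected to surface that tag.

# 'nlu' is a hypothetical instance of the class defined above.
for word, flag in nlu.pos.cut('快进到成龙出场'):
    print(word, flag)  # "成龙" should carry the user-defined 'movie_person' tag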
Example No. 11
 def __init__(self, threading_pool):
     self._save_dir = os.path.join(config.LEARN_PATH, "{user_id}")
     self.token = jieba.Tokenizer()
     self.pos_token = posseg.POSTokenizer(self.token)
     self.threading_pool = threading_pool
     self.queue = Queue()
Example No. 12
    def __init__(self):
        self.zt = ["不能", "无法", "没法", "不足", "不了", "不够", "不多", "欠费"]  # 21
        self.zt1 = ["欠费了", "不够了", "不足了", "停机了"]

        self.noaccount_needadd1 = ["充值了", "充了", "充钱了", "冲了", "交了", "充过"]  #19
        self.noaccount_needadd2 = ["不能", "无法", "没法", "不足", "不了"]  #20
        self.noaccount_keywords_1 = [
            "充", "充值了", "充了", "冲", "冲了", "交了", "充值", "缴费", "交费", "充话费", "交",
            "充过"
        ]  #8
        self.noaccount_keywords = [
            "没显示", "没有显示", "没反应", "没有反应", "没有收到", "没收到", "没增加", "没有增加", "没提示",
            "没有提示"
            "停机", "打不出去", "还不通", "还欠费", "没到", "没有到", "欠费", "欠", "停机", "不能",
            "无法", "没法", "没变", "没钱"
        ]
        self.noaccount_keywords_2 = [
            "没到账", "没有到账", "没给充上", "没有充上", "不到账", "未到账", "不进账"
        ]  #10

        self.nook_keyword2 = ["有", "还有"]
        self.nook_keyword3 = ["钱", "余额", "元"]

        self.nook_keyword = [
            "只到了", "只到账", "不对", "只有", "只剩", "只", "少了", "不准确", "就剩", "怎么剩",
            "就还剩", "应该还剩", "不一致", "不一样", "对不上", "打不出去", "打不通", "不能", "无法",
            "没法", "有问题", "有点问题"
        ]  #9
        self.nook_keyword1 = ["话费", "余额", "查话费"]  #18

        self.noaccount_exceptwords = [
            "兑换", "流量", "积分", "宽带", "星级", "游戏", "电子券", "活动", "发票", "充错", "会员",
            "打印发票", "送"
        ]
        self.noaccount_keywords_3 = [
            "微信公众号", "公众号", "app", "客户端", "掌厅", "掌上营业厅"
        ]  #17

        self.huafei_1 = ["充", "交", "冲", "冲话费", "充流量", "充话费", "充值", "充钱",
                         "缴费"]  #1
        self.huafei_2 = [
            "方法", "哪里", "怎么", "方式", "如何", "渠道", "我想", "我要", "想", "要", "咋", "怎样"
        ]  #2
        self.huafei_3 = [
            "发票", "开发票", "q", "qb", "qq币", "qq", "打印发票", "记录", "电费", "座机",
            "固定电话", "会员", "扣币"
        ]  #3
        self.huafei_4 = [
            "刚", "刚刚", "积分", "宽带", "星级", "游戏", "电子券", "活动", "兑换", "固话", "开机",
            "漫游", "语音", "扣", "账单", "明细", "详单"
        ]  #16
        self.huafei_5 = ["知道", "了解"]

        self.yue_1 = ["查", "查询"]  #11
        self.yue_2 = ["话费", "钱", "余额"]  #12
        self.yue_3 = ["剩", "剩余", "有多少", "还有多少"]  #13
        self.yue_4 = ["查话费", "余额"]  #14

        self.tkz = jieba.Tokenizer()
        from answers.word_dictionary import Dictionary
        for i in Dictionary.meword:
            elem = i.split(' ')
            if len(elem) == 1:
                self.tkz.add_word(i)
            elif len(elem) == 2:
                self.tkz.add_word(elem[0], tag=elem[1])
            else:
                self.tkz.add_word(elem[0], tag=elem[2], freq=int(elem[1]))
            # self.tkz.load_userdict("meword")
        self.psegp = pseg.POSTokenizer(self.tkz)
Example No. 13
import jieba.posseg as pseg
import jieba

data = open("data.txt", "r")
result = open("result.txt", "w")
pos = pseg.POSTokenizer(jieba.Tokenizer(dictionary='dict.txt.big'))
for line in data:
    words = pos.cut(line)
    for word, flag in words:
        result.write('%s\\%s ' % (word, flag))
    break
data.close()
result.close()
words = pos.cut('我的老师说:“我们大家都是好孩子,好孩子应该看《好书》”')
for word, flag in words:
    print('%s %s' % (word, flag))
Example No. 14
 def __init__(self, user_dict: Union[str, Iterable] = None):
     self.t = posseg.POSTokenizer()
     self.t.initialize()
     self.trie = Trie()
     if user_dict:
         self.load_user_dict(user_dict)
Example No. 15
import glob
import math
import json
import jieba
from jieba import posseg

token = jieba.Tokenizer()
file = glob.glob("./../jieba_dict/*.txt")
for fp in file:
    token.load_userdict(fp)
pos_token = posseg.POSTokenizer(token)
file = glob.glob("./*.txt")
item_ids = 0
sentence_ids = 0  # number of sentence entries
word_bag = set()
PMI_DICT = {}  # key: (wj, wi); value: log(P(wi|wj) / P(wi))
WI = {}
for fp in file:
    for line in open(fp, "r", encoding="utf-8"):
        line_list = line.strip().split("####")
        if len(line_list) != 3:
            continue
        query, response, sentiment = line_list
        sentence_ids += 1
        query_token = [word for word, tag in pos_token.lcut(query) if "n" in tag]
        for to in query_token:
            word_bag.add(to)
        response_token = [word for word, tag in pos_token.lcut(response) if "n" in tag]
        for to in response_token:
            WI[to] = WI.get(to, 0) + 1
            word_bag.add(to)
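The listing stops before PMI_DICT is populated. Below is a hedged sketch of how the formula from the comment, log(P(wi|wj) / P(wi)), could be computed once co-occurrence counts are available; CO_OCCUR and WJ_TOTAL are hypothetical counters and are not part of the original code.

# Hypothetical completion sketch, not from the original source.
# CO_OCCUR[(wj, wi)]: number of sentences whose query contains wj and whose
# response contains wi; WJ_TOTAL[wj]: number of sentences whose query contains wj.
for (wj, wi), co_count in CO_OCCUR.items():
    p_wi = WI[wi] / sentence_ids              # P(wi)
    p_wi_given_wj = co_count / WJ_TOTAL[wj]   # P(wi | wj)
    PMI_DICT[(wj, wi)] = math.log(p_wi_given_wj / p_wi)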