Example #1
def separate_xwlbotxt(file_str,
                      level=1,
                      coding="utf-8",
                      add_words=[],
                      del_words=[]):
    '''
    :param file_str: path to a txt file, or the text to segment directly
    :param level: result mode; 1 should return a list, 2 an iterator (currently unused, a list is always returned)
    :param coding: encoding used when reading the file
    :param add_words: words to add to the jieba dictionary before cutting
    :param del_words: words to remove from the jieba dictionary before cutting
    :return: list of tokens, or None if file_str is neither a file path nor a string
    '''
    parsing_str = ""
    if pathlib.Path(file_str).is_file():
        with open(file_str, "r", encoding=coding) as f:
            for line in f:
                parsing_str += line.strip()
    elif isinstance(file_str, str):
        parsing_str = file_str
    else:
        return None
    if add_words:
        for word in add_words:
            jieba.add_word(word)
    if del_words:
        for word in del_words:
            jieba.del_word(word)
    return jieba.lcut(parsing_str)
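A minimal usage sketch of the function above, assuming jieba and pathlib are imported alongside it; the file name and the custom words are illustrative, not taken from the original project:

# Hypothetical call: segment a UTF-8 text file, adding one custom word and
# removing one over-merged entry before cutting. Passing a plain string
# instead of a path segments the string itself.
tokens = separate_xwlbotxt("corpus.txt",
                           add_words=["云计算"],
                           del_words=["自定义词"])
print(tokens[:20])  # first 20 tokens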
Example #2
def Initialization():
    jieba.suggest_freq('采购单', True)
    jieba.suggest_freq('采购提交', True)
    jieba.suggest_freq('玻尿酸', True)
    jieba.suggest_freq("新增采购", True)
    jieba.suggest_freq("水泥", True)
    jieba.del_word('采购提交水泥')
Example #3
def better_cut(one_string, discover_new_word=False):
    one_string = re.sub(r'\s+', '', one_string)  # strip all whitespace
    final_result = []
    temp_list = jieba.lcut(one_string, HMM=discover_new_word)
    if not discover_new_word:  # HMM=False already gives noticeably finer granularity
        for word in temp_list:
            if not isAllZh(word):
                continue
            if len(word) > 4:
                jieba.del_word(word)  # jieba.add_word(word, freq=0) works as well
                final_result.extend(jieba.lcut(word, HMM=discover_new_word))
            else:
                final_result.append(word)
    else:
        for word in temp_list:
            if not isAllZh(word):
                continue
            # if len(word) == 4:  # pick the threshold from the word frequencies
            #     print(word, jieba.get_FREQ(word))
            if jieba.get_FREQ(word) is None \
                    or (len(word) > 1 and (jieba.get_FREQ(word) is None or jieba.get_FREQ(word) == 0)) \
                    or len(word) > 4 \
                    or (len(word) == 4 and jieba.get_FREQ(word) is not None and jieba.get_FREQ(word) < 100):
                jieba.del_word(word)  # force a re-cut; jieba.add_word(word, freq=0) works as well
                final_result.extend(jieba.lcut(word))
            else:
                final_result.append(word)
    return final_result
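A hypothetical call of better_cut, assuming re, jieba and the project's isAllZh helper are importable; the sentence is made up and the exact tokens depend on the loaded dictionary:

# Re-cut long or low-frequency Chinese chunks into finer pieces.
print(better_cut("今天天气不错我们去人民医院做检查"))
print(better_cut("今天天气不错我们去人民医院做检查", discover_new_word=True))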
def addDictToJieba():
    ##### roadList
    content = open('../data_crawl/finalRoads.txt', 'r').read().strip('\n')
    contentList = content.split('\n')
    print(len(contentList))
    ############# load the district dictionary
    districtNameList = grab('/home/yr/intellicredit/data/' + 'districtNameList0503')
    test_sent = [
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n"
    ]
    ######## print the cut before adding the dictionaries
    #print test_sent[0].decode('utf-8')
    words = jieba.cut(test_sent[0].decode('utf-8'))
    #print('/'.join(words))

    #### add the word dictionaries to jieba
    for w in districtNameList[:] + contentList:
        #print w
        jieba.add_word(w)
    #### add district names that are missing from the dictionary
    jieba.add_word('浦东区')
    jieba.add_word('浦东新区')
    jieba.del_word('上海市')
    jieba.add_word('兰城路')

    words = jieba.cut(test_sent[0].decode('utf-8'))
Example #5
def get_subword_list(big_word):
    if not isZH(big_word[0]):
        return big_word
    if len(big_word)>4:
        jieba.del_word(big_word)
        return jieba.lcut(big_word, HMM=False)
    else:
        return big_word
Example #6
    def __init__(self, cus_files=None):

        assert type(cus_files) is list, "cus_files must be a list of file paths"
        for cf in cus_files:
            # drop every word listed in the custom file, then load the file as a user dict
            for s in open(cf, 'r'):
                del_word(s.split()[0])
            load_userdict(cf)
Example #7
    def sentence_segmentation(self, sentence, entity1, entity2):
        jieba.add_word(entity1, freq=999999)
        jieba.add_word(entity2, freq=999999)

        seglist = list(jieba.cut(sentence, cut_all=False, HMM=False))
        jieba.del_word(entity1)
        jieba.del_word(entity2)
        return seglist
Example #8
    def test_parse_sentence(self):
        self.assertTrue(True)

        import jieba
        jieba.del_word('价格便宜')

        sent_txt = '价格便宜。'
        for sent in parser.parse2sents(sent_txt):
            print('sent: ', sent)
Example #9
def load_data_and_labels(train_file_org='atec_nlp_sim_train.csv',
                         train_file_add='atec_nlp_sim_train_add.csv',
                         word_dict_file='word_dict',
                         userdict='userdict1.txt',
                         less_frequency=5):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    # Load the jieba user dict; if a word is rare and not recognised by the pretrained dictionary, do not treat it as a segmentation unit
    jieba.load_userdict(userdict)
    word_f_less = torch.load('word_f_less')
    for word in word_f_less:
        jieba.del_word(word)

    train_data_org = list(
        open(train_file_org, "r", encoding='utf-8').readlines())
    train_data_add = list(
        open(train_file_add, "r", encoding='utf-8').readlines())
    train_data = train_data_org + train_data_add
    train_sentence = []

    for i in range(0, len(train_data)):
        if train_data[i]:
            seg_list1 = jieba.lcut(train_data[i].split('\t')[1], cut_all=False)
            seg_list2 = jieba.lcut(train_data[i].split('\t')[2], cut_all=False)
            train_sentence.append([[seg_list1, seg_list2],
                                   float(train_data[i].split('\t')[-1])])
        else:
            continue
        if i % 1000 == 0:
            print('transferring sentences to lists:', i, '/', len(train_data),
                  'processed')

    # Load the preprocessed pretrained dictionary {word: vector}
    word_dict = torch.load('word_dict')
    word_dict['padding'] = np.zeros((200), dtype=np.float64)
    word_dict['unknow'] = np.zeros((200), dtype=np.float64)

    # Build the dictionaries for looking up indices by word and words by index
    word2ix = {word: ix for ix, word in enumerate(word_dict.keys())}
    ix2word = {ix: word for word, ix in word2ix.items()}

    new_word, word_f_less = find_new_word_and_frequency(
        train_sentence, word_dict, less_frequency)

    train_sentence = delete_new_word(train_sentence, word_f_less)
    word_matrix = make_word_matrix(word_dict).float()
    train_sentence, valid_sentence, test_sentence = get_train_and_valid(
        train_sentence)

    del train_data, train_data_add, train_data_org
    del train_file_add, train_file_org

    return train_sentence, valid_sentence, test_sentence, word_dict, word2ix, ix2word, word_matrix
Example #10
def make_worldcloud(file_path):
    jieba.add_word('少女心')
    jieba.add_word('颜值')
    jieba.del_word('男主')
    jieba.del_word('女主')
    text_from_file_with_apath = io.open(file_path, 'r',
                                        encoding='UTF-8').read()
    wordlist_after_jieba = jieba.cut(text_from_file_with_apath,
                                     cut_all=False)  # accurate mode
    # wl_space_split = " ".join(wordlist_after_jieba)
    # print(wl_space_split)
    backgroud_Image = plt.imread('./dou1.jpg')
    # print('loaded jpg!')
    '''set wordcloud mode'''
    #stopwords = STOPWORDS.copy()
    # stopwords = set(STOPWORDS)
    stopwords = stopwordslist('./stopwords.txt')  # path of the stop-word list
    # stopwords.add("哈哈")
    # stopwords.add("就是")
    # stopwords.add("电视剧")
    # stopwords.add("男主")
    # stopwords.add("女主")
    # stopwords.add("还是")  # add a single stop word
    # stopwords.update([u'哈哈', u'就是', u'男主', u'女主', u'还是', u'电视剧'])  # add several at once

    outstr = ''
    for word in wordlist_after_jieba:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    print(outstr)

    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',  # background colour
        mask=backgroud_Image,  # background image used as the mask
        font_path=
        '/Library/Fonts/华文仿宋.ttf',  # a Chinese font; required for Chinese text, otherwise boxes are shown instead of characters
        max_words=300,  # maximum number of words to display
        #stopwords=stopwords,  # stop words
        #stopwords=STOPWORDS.add('男主'),  # stop words
        random_state=50,  # number of random states, i.e. how many colour schemes
    )
    #wc.generate_from_text(wl_space_split)  # load the text
    wc.generate_from_text(outstr)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # recolour the words with the background image's colours
    plt.imshow(wc)  # show the word cloud
    plt.axis('off')  # hide the x and y axes
    plt.show()  # display
    # get the directory this module lives in
    d = path.dirname(__file__)
    wc.to_file(path.join(d, "test.jpg"))
    print('word cloud!')
Example #11
def init():
    del_list = ["订单", "取消"]  # words to remove from the dictionary
    suggest_list = ["取消订单", "单方面取消", "单方取消", "单方面违约", "无故退款", "不发货",
                    "按时发货"]  # phrases to add / keep unsplit

    for word in del_list:
        jieba.del_word(word)

    for word in suggest_list:
        jieba.suggest_freq(word, True)
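A small sketch of exercising init(); the sample sentence is made up, and the printed tokens are simply whatever jieba returns after the adjustments:

import jieba

init()
# Inspect how the tuned dictionary segments a sentence containing the tuned phrases.
print(jieba.lcut("请问如何取消订单,商家单方面违约还不发货"))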
Example #12
    def shorter_chinese_cut(self, line):
        result = []
        for long_word in jieba.lcut(line, HMM=False):
            cp = ord(long_word[0])
            if self._is_chinese_char(cp) and len(long_word) > 3:
                jieba.del_word(long_word)
                result.extend(jieba.lcut(long_word))
            else:
                result.append(long_word)
        return result
Example #13
    def __init__(self, save_path):
        add_words_file = os.path.join(save_path, '_add_words.txt')
        del_words_file = os.path.join(save_path, '_del_words.txt')
        with open(add_words_file, "r", encoding='utf-8') as f:
            for word in f:
                word = word.strip().lstrip('\ufeff')
                jieba.add_word(word)
        with open(del_words_file, "r", encoding='utf-8') as f:
            for word in f:
                word = word.strip().lstrip('\ufeff')
                jieba.del_word(word)
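Both files read above are plain word lists, one word per line; a minimal sketch of producing them (the helper name, directory and words are hypothetical):

import os

def write_word_lists(save_path, add_words, del_words):
    # One word per line, matching the reader in __init__ above.
    with open(os.path.join(save_path, '_add_words.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(add_words))
    with open(os.path.join(save_path, '_del_words.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(del_words))

write_word_lists('./dict_dir', ['云计算', '石墨烯'], ['自定义词'])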
Example #14
def cut_text(text):
    stopwords = [
        '熟悉', '技术', '职位', '相关', '工作', '开发', '使用', '能力', '优先', '描述', '任职', '经验',
        '经验者', '具有', '具备', '以上', '善于', '一种', '以及', '一定', '进行', '能够', '我们'
    ]
    for stopword in stopwords:
        jieba.del_word(stopword)

    words = jieba.lcut(text)
    content = " ".join(words)
    return content
Example #15
    def __load_dict(self):
        """
        Function: load the yaml file and apply the word sets configured for the jieba dictionary

        :return: None; the words are registered with jieba directly
        """
        with open(CUT_WORD_FILE, encoding='utf-8') as f:
            word_dict = yaml.safe_load(f)
        for del_words in word_dict.get("del_word", []):
            jieba.del_word(del_words)
        for add_words in word_dict.get("add_word", []):
            jieba.add_word(add_words)
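The method above expects a mapping with add_word and del_word lists; a hypothetical CUT_WORD_FILE, once parsed by yaml.safe_load, would look roughly like this (the words are illustrative):

word_dict = {
    "add_word": ["石墨烯", "云计算"],  # registered via jieba.add_word
    "del_word": ["自定义词"],          # removed via jieba.del_word
}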
Example #16
def key_name_split(input_key_name_cn):
    jieba.del_word('总金额')
    jieba.load_userdict('E:/cai_project/python/test_file/keyword_dict.txt')
    seq_list = jieba.cut(input_key_name_cn.replace('的', ''))
    split_word_list = list(seq_list)
    number_index = []
    for key in range(len(split_word_list)):
        if re.match('\\d+', split_word_list[key]):
            number_index.append(key)
        else:
            print(split_word_list[key])

    return '_'.join(split_word_list)
Example #17
    def setdict(self, cmd):
        arr = cmd.split(':')
        if len(arr) == 3:
            if arr[1] == 'add':
                jieba.add_word(arr[2])
                return '添加词典【' + arr[2] + '】成功!'
            elif arr[1] == 'del':
                jieba.del_word(arr[2])
                return '删除词典【' + arr[2] + '】成功!'
            else:
                return '错误的命令!'
        else:
            return '错误的命令!'
Example #18
def word_seg(s):
    #tukai
    source = read_data_test('source.csv')
    for i in range(860):
        source[i] = source[i][0]
    add = ['汪建', '姜广策', '藻酸双酯钠'] + source
    delate = ['了', '的']
    for x in add:
        jieba.add_word(x)
    for x in delate:
        jieba.del_word(x)
    str_list = list(jieba.cut(s, cut_all=False, HMM=False))
    return str_list
Example #19
def word_seg(s):
    add = ['汪建', '姜广策', '藻酸双酯钠', '盘口'] + source
    delate = ['了', '的']
    for x in add:
        jieba.add_word(x)
    for x in delate:
        jieba.del_word(x)
    str_list = list(jieba.cut(s, cut_all=False, HMM=False))
    s_list = []
    for x in str_list:
        if x not in stop_word:
            s_list.append(x)
    return s_list
Example #20
async def reply(session: CommandSession):
    message = session.state.get('message').replace(' ', '')
    if message in ('电话问题', '电脑问题', '网络问题'):
        table = '58_robot_1'
        answer = await database_search(session, table, message)
    elif 'add-' in message:
        key = message.split('-')[1]
        if key != '':
            with open(base_dir + '/dict.txt', 'a') as k:
                k.write(key + ' ' + '10' + '\n')
            jieba.add_word(key)
            answer = '关键词已经激活'
        else:
            answer = '请按照此格式激活:add-关键词'
    elif 'del-' in message:
        dict_txt = []
        key = message.split('-')[1]
        if key != '':
            fp = open(base_dir + '/dict.txt', 'r')
            txt = fp.readlines()
            for i in txt:
                if key in i:
                    jieba.del_word(key)
                else:
                    dict_txt.append(i)
            fp.close()

            with open(base_dir + '/dict.txt', 'w+') as fp:
                for i in dict_txt:
                    fp.write(i)
            answer = '关键词已经删除'
        else:
            answer = '请按照此格式删除:del-关键词'
    else:
        table = '58_robot_2'
        des = supplement
        answer = await database_search(session, table, message) + '\n' + des

    if EXPR_DONT_UNDERSTAND not in answer:
        print(answer)
        await session.send(answer)
        msg_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # write the chat record to the database

        try:
            cursor.execute(
                'insert into 58_robot_3 (im, time, question, answer) values ("QQ", "{}", "{}", "{}")'
                .format(msg_time, message, answer))
            db.commit()
        except:
            db.rollback()
Example #21
def cloud(text, itemid):
    # path where the generated image is stored
    fileroute = imageRoute + itemid + ".jpg"
    # jieba segmentation
    removes = [
        '团购', '点评', '但是', '还是', '感觉', '就是', '而且', '没有', '还有', '不过', '知道', '什么',
        '比较', '这里',
        '我们', '以前', '一下', '一次', '不是', '不是', '我们', '时候', '老板', '里面', '很多', '朋友',
        '一个', '这家', '购点评', '有点', '觉得', '东西', '个人', '绵阳', '这个', '下次', '因为', '位置'
    ]
    for w in removes:
        jieba.del_word(w)
    words = jieba.lcut(text)
    cuted = ' '.join(words)
    # wordCloud: build the word cloud
    fontpath = "SourceHanSansCN-Regular.otf"
    #backgroud_Image = plt.imread('cloud.jpg')
    wc = WordCloud(
        background_color='black',  # background colour
        # mask=backgroud_Image,  # background image used as the mask
        max_words=33,  # maximum number of words to display
        stopwords=STOPWORDS,  # stop words
        font_path=fontpath,  # font to use; Chinese will not render without one
        max_font_size=300,  # largest font size
        min_font_size=50,  # smallest font size
        #random_state=42,  # number of random states, i.e. how many colour schemes
        mode='RGBA',
        colormap='Spectral',
        collocations=False,  # avoid repeated words
        width=1590,
        height=1205,
        margin=20,  # image width/height and word spacing
    )
    wc.generate(cuted)
    # image_colors = ImageColorGenerator(backgroud_Image)
    # wc.recolor(color_func=image_colors)
    fig, ax = plt.subplots()
    plt.figure(dpi=100)
    plt.imshow(wc, interpolation='catrom', vmax=1000)
    plt.axis('off')
    height, width = wc.height, wc.width
    # with dpi=300 the image size would be height*width
    fig.set_size_inches(width / 100.0 / 3.0, height / 100.0 / 3.0)
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
    plt.subplots_adjust(top=1, bottom=0, left=0, right=1, hspace=2, wspace=2)
    plt.margins(0, 0)
    plt.savefig(fileroute)
    #plt.show()
    return fileroute
Example #22
def jieba_test():
    """

    :return:
    """
    jieba.load_userdict("./dict/user_dict.txt")

    jieba.add_word('石墨烯')
    jieba.add_word('凱特琳')
    jieba.del_word('自定义词')

    test_sent = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
                 "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
                 "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。")
    words = jieba.cut(test_sent)
    print('/'.join(words))

    print("=" * 40)

    result = pseg.cut(test_sent)

    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')

    print("\n" + "=" * 40)

    terms = jieba.cut('easy_install is great')
    print('/'.join(terms))
    terms = jieba.cut('python 的正则表达式是好用的')
    print('/'.join(terms))

    print("=" * 40)
    # test frequency tune
    testlist = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出')),
    ]

    for sent, seg in testlist:
        print('/'.join(jieba.cut(sent, HMM=False)))
        word = ''.join(seg)
        print('%s Before: %s, After: %s' %
              (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
        print('/'.join(jieba.cut(sent, HMM=False)))
        print("-" * 40)

    return None
Example #23
    def __init__(self, del_file="jieba_del.txt", areacode_file="areacode.txt", area_json_file="china_city_area.json",
                 baidu_ak="eXiTVqhBbnU7TeF3WrtGAvxXkIUXBRwg"):
        self.cities = dict()
        with open(os.path.join(self.current_path, del_file), encoding="utf8") as file:
            for word in file:
                jieba.del_word(word.strip())  # strip the newline so the deletion matches the dictionary entry
        with open(os.path.join(self.current_path, areacode_file), encoding="utf8") as file:
            for line in file:
                city, areacode = line.replace("\r", "").replace("\n", "").split(",")
                self.cities[city] = areacode
        with open(os.path.join(self.current_path, area_json_file), encoding="utf8") as file:
            self.area = json.load(file)
        self.geocoder_url_template = "http://api.map.baidu.com/geocoder/v2/?address={addr}&output=json&ak=" + baidu_ak
        self.city_url_template = "http://api.map.baidu.com/geocoder/v2/?ak=" + baidu_ak \
                                 + "&location={lat},{lng}&output=json"
        self.session = requests.session()
Example #24
def getContent(table_name):
    db = dbHandle()
    jieba.del_word("电影")
    jieba.del_word("导演")
    jieba.del_word("没有")
    jieba.del_word("影片")
    jieba.del_word("看到")
    query_sql = "select comment_content from {0}".format(table_name)
    texts = db.query_db(query_sql)
    text = ''
    for t in texts:
        text += t[0]
    #result = jieba.analyse.textrank(text, topK=1000, withWeight=True)
    result = jieba.cut(text, cut_all=True)
    wl_space_split = " ".join(result)
    return wl_space_split
Example #25
def del_sentiment_dict():
    for word in sentiment_emotion_dict:
        jieba.del_word(word)
    for word in sentiment_privative_dict:
        jieba.del_word(word)
    for word in sentiment_transitional_dict:
        jieba.del_word(word)
    for word in sentiment_degree_dict:
        jieba.del_word(word)
Example #26
def process(test_path, result_path):
    jieba.load_userdict("./user.dict")
    jieba.del_word('元用')
    word2idx = load_word2idx("./word2idx.dict")

    source_inputs = []
    target_inputs = []
    all_lineno = []
    with open(test_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            lineno, sen1, sen2 = line.encode('utf-8').decode(
                'utf-8-sig').strip().split('\t')
            idx1 = [
                word2idx.get(w, word2idx['<UNK>']) for w in jieba.cut(sen1)
                if w.strip()
            ]
            idx2 = [
                word2idx.get(w, word2idx['<UNK>']) for w in jieba.cut(sen2)
                if w.strip()
            ]
            all_lineno.append(lineno)

            def standard_length(idx):
                if len(idx) > seq_length:
                    idx = idx[:seq_length]
                else:
                    for i in range(len(idx), seq_length):
                        idx.append(word2idx['<UNK>'])
                return idx

            source_inputs.append(standard_length(idx1))
            target_inputs.append(standard_length(idx2))

    model = build_model(word2idx)
    model.load_weights('./atec.model')
    logits = model.predict([source_inputs, target_inputs])

    predicts = np.reshape(np.argmax(logits, axis=1), -1)

    with open(result_path, 'r', encoding='utf-8') as f:
        results = []
        for line in f:
            _, score = line.strip().split('\t')
            results.append(int(score))

    print(classification_report(results, predicts))
Example #27
def label(file,wordlist,classifier):

	lf = []
	namelist4text = []
	#keyword(file)
	countr = 0
	countf = 0

	list4nn = discorverynewword.dis_new_word(file,wordlist)
	
	for inn in list4nn:
		pred = classifier.predict(tran.input_word(wordlist,file,inn))

		if int(pred[0]) != 0:
			namelist4text.append(inn)
	#namelist4text = list4nn
	for inn in namelist4text:
		jieba.add_word(inn, freq=1000, tag='nn')
	with open('./novel/'+file) as f:
		str4text = f.read()
		# split the text into blocks on double newlines
		ls4block = str4text.strip().split('\n\n')

		for block in ls4block:
			lb = dealblock(file,block,wordlist,namelist4text)
			lf.append(lb)
			# each ls is one block
			# write out the results for this file
	with open('./result/label_'+file,'w') as f:
		for each in lf:
			f.write('\n')
			if each != -1:
				countr += 1
				for r in each:
					f.write(r+'\n')
			else:
				countf += 1
	

	for inn in namelist4text:
		jieba.del_word(inn)
		print('delete word ' +inn+' successful!')

	print(str(countr+countf)+' blocks complete!')

	return countr,countf
Example #28
    def Get_fenci(self):

        # jieba.add_word('石墨烯')  # dynamically add a custom word
        jieba.add_word('凱特琳')
        jieba.del_word('自定义词')
        jieba.add_word("易风化")
        filtered_tokens = []
        test_sent = ""
        for i in range(1,2):
            Data_path = path + "he"+".txt"
            test_sent ="".join(open(Data_path, 'rb').read())
        print (test_sent)
        words = jieba.cut(test_sent)
        filtered_tokens.append([each for each in jieba.cut(test_sent)])
        print ('-'*40)
        print (json.dumps(filtered_tokens))
        print("="*40)
Example #29
    def Get_fenci(self):

        # jieba.add_word('石墨烯')  # dynamically add a custom word
        jieba.add_word('凱特琳')
        jieba.del_word('自定义词')
        jieba.add_word("易风化")
        filtered_tokens = []
        test_sent = ""
        for i in range(1, 2):
            Data_path = path + "he" + ".txt"
            test_sent = "".join(open(Data_path, 'rb').read())
        print(test_sent)
        words = jieba.cut(test_sent)
        filtered_tokens.append([each for each in jieba.cut(test_sent)])
        print('-' * 40)
        print(json.dumps(filtered_tokens))
        print("=" * 40)
Example #30
def key_name_split(input_key_name_cn):
    get_oradata = GetOracleData()
    jieba.del_word('总金额')
    jieba.load_userdict('E:/cai_project/python/test_file/keyword_dict.txt')
    seq_list = jieba.cut(input_key_name_cn.replace('的', ''))
    split_word_list = list(seq_list)
    standard_word_cn = []
    business_word_list = []
    kpi_word_list = []
    aggr_word_list = []
    cycle_word_list = []
    for key in range(len(split_word_list)):
        if re.match('\\d+', split_word_list[key]):
            split_word_list.append(split_word_list[key])
        else:
            root_result = get_oradata.get_root_word(split_word_list[key])
            standard_word_cn.append(root_result['WORD_NAME_CN'])
            if key > 0 and re.match('\\d+', split_word_list[
                    key - 1]) and root_result['WORD_TYPE'] == '周期修饰词':
                root_result['WORD_NAME_EN'] = split_word_list[
                    key - 1] + root_result['WORD_NAME_EN']
                root_result['WORD_NAME_CN'] = split_word_list[
                    key - 1] + root_result['WORD_NAME_CN']
                root_result['WORD_NAME_EN_ABBR'] = split_word_list[
                    key - 1] + root_result['WORD_NAME_EN_ABBR']
                cycle_word_list.append(root_result)
            elif root_result['WORD_TYPE'] == '周期修饰词':
                cycle_word_list.append(root_result)
            elif root_result["WORD_TYPE"] == '业务修饰词':
                business_word_list.append(root_result)
            elif root_result["WORD_TYPE"] == '指标修饰词':
                kpi_word_list.append(root_result)
            elif root_result["WORD_TYPE"] == '聚合修饰词':
                aggr_word_list.append(root_result)

    get_oradata.conn_close()
    standard_word_list = business_word_list + kpi_word_list + aggr_word_list + cycle_word_list
    standard_word_en = [x['WORD_NAME_EN'] for x in standard_word_list]
    standard_word_en_abbr = [
        x['WORD_NAME_EN_ABBR'] for x in standard_word_list
    ]
    # print '_'.join(standard_word_en)
    # print '_'.join(standard_word_en_abbr)
    return '_'.join(standard_word_en), '_'.join(standard_word_en_abbr)
Example #31
def cleanup_data(data_path: str):
    """clean up data to desired format"""
    stopwords = get_stopwords(path=f'{data_path}/stopwords.txt')
    parse_strategy = None

    with open(f'{data_path}/dictionary', 'r', encoding='utf-8') as dictionary, \
      open(f'{data_path}/not_word', 'r', encoding='utf-8') as not_word:

        dictionary_lines = dictionary.readlines()
        not_word_lines = not_word.readlines()

        for dictionary_line in dictionary_lines:
            dictionary_line = dictionary_line.strip()
            jieba.add_word(dictionary_line)

        for not_word_line in not_word_lines:
            not_word_line = not_word_line.strip()
            jieba.del_word(not_word_line)

    with open(f'{data_path}/ant_train', 'r', encoding='utf-8') as ant_train, \
      open(f'{data_path}/ant_train_add', 'r', encoding='utf-8') as ant_train_add, \
      open(f'{data_path}/epidemic_dev.csv', 'r', encoding='utf-8') as epidemic_dev, \
      open(f'{data_path}/epidemic_train.csv', 'r', encoding='utf-8') as epidemic_train, \
      open(f'{data_path}/icqmc_train.txt', 'r', encoding='utf-8') as icqmc_train, \
      open(f'{data_path}/icqmc_dev.txt', 'r', encoding='utf-8') as icqmc_dev, \
      open(f'{data_path}/icqmc_test.txt', 'r', encoding='utf-8') as icqmc_test, \
      open(f'{data_path}/simtrain_to05sts.txt', encoding='utf-8') as simtrain:

        ant_train_lines = ant_train.readlines() + ant_train_add.readlines()
        epidemic_lines = epidemic_dev.readlines(
        )[1:] + epidemic_train.readlines()[1:]
        icqmc_lines = icqmc_train.readlines()[1:] + icqmc_dev.readlines(
        )[1:] + icqmc_test.readlines()[1:]

        sentences = []

        sentences += cleanup_corpus(epidemic_lines, ',', 2, 5, stopwords)
        sentences += cleanup_corpus(ant_train_lines, '\t', 1, 4, stopwords)
        sentences += cleanup_corpus(icqmc_lines, '\t', 0, 3, stopwords)

        with open(f'./preprocessed/data', 'a+', encoding='utf-8') as ant_file:
            for sentence in sentences:
                ant_file.write(f'{sentence}\n')
Example #32
def analyze_suggestions_2():
    fontpath = 'SourceHanSansCN-Regular.otf'

    content = open('Suggestions.txt', 'r').read()

    # words to remove
    removes = ['最好', '考虑', '可以', '孩子', '不能', '不要', '希望', '主要', '离家近', '学生']
    for rm in removes:
        jieba.del_word(rm)

    # words to add
    adds = [
        '安全第一', '确保安全', '运行时间太长', '减少时间', '减少换乘', '缩短时间', '等车时间',
        '固定线路', '固定班次', '准时', '时间不要太长', '步行距离短', '准点', '票价合理', '公交车站', '公交站台',
        '站点', '小区门口', '附近'
    ]
    for add in adds:
        jieba.add_word(add)

    words = jieba.cut(content, cut_all=False)
    #print("Default Mode: " + "/ ".join(words))  # accurate mode

    cuted_words = ' '.join(words)
    print(cuted_words)

    wc = WordCloud(
        font_path=fontpath,  # font to use
        background_color="white",  # background colour
        max_words=120,  # maximum number of words shown in the cloud
        max_font_size=500,  # largest font size
        min_font_size=20,  # smallest font size
        random_state=42,  # random seed
        collocations=True,  # keep two-word collocations (set False to avoid repeated words)
        width=1600,
        height=1200,
        margin=10,  # image width/height and word spacing; only effective together with plt.figure(dpi=xx) scaling
    )
    wc.generate(cuted_words)
    # print f
    # wc = WordCloud()
    # wc.generate(cuted_words)
    wc.to_file('./2.jpg')
Example #33
def del_word_dict(word):
    '''
    Remove a word from the jieba dictionary.
    '''
    jieba.del_word(word)
Example #34
    cuttest("长春市长春节讲话")
    cuttest("结婚的和尚未结婚的")
    cuttest("结合成分子时")
    cuttest("旅游和服务是最好的")
    cuttest("这件事情的确是我的错")
    cuttest("供大家参考指正")
    cuttest("哈尔滨政府公布塌桥原因")
    cuttest("我在机场入口处")
    cuttest("邢永臣摄影报道")
    cuttest("BP神经网络如何训练才能在分类时增加区分度?")
    cuttest("南京市长江大桥")
    cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
    cuttest('长春市长春药店')
    cuttest('邓颖超生前最喜欢的衣服')
    cuttest('胡锦涛是热爱世界和平的政治局常委')
    cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
    cuttest('一次性交多少钱')
    cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
    cuttest('小和尚留了一个像大和尚一样的和尚头')
    cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
    cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
    cuttest('AT&T是一件不错的公司,给你发offer了吗?')
    cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
    cuttest('枪杆子中出政权')
    cuttest('张三风同学走上了不归路')
    cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
    cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
    jieba.del_word('很赞')
    cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')
Example #35
# coding: utf-8
import jieba


def cuttest(sentence):
    seg_list = jieba.cut(sentence, cut_all=False, HMM=True)
    print("Accurate mode: " + "/ ".join(seg_list))

cuttest('我需要廉租房')
cuttest('据说这位语言学家去参加神马学术会议了')
cuttest('小明硕士毕业于中国科学院计算所,后在日本京都大学深造')
cuttest('他来到了网易杭研大厦')

# jieba.add_word('湖南')
# jieba.add_word('长沙市')
jieba.del_word('湖南长沙市')
cuttest('湖南长沙市天心区')

#
cuttest(u'自然语言处理')
Example #36
Created on 2015-05-11

@author: BFD474
'''

from __future__ import print_function, unicode_literals
import sys
import jieba
import jieba.posseg as pseg

sys.path.append( "../" )
jieba.load_userdict( "userdict.txt" )

jieba.add_word( '石墨烯' )
jieba.add_word( '凱特琳' )
jieba.del_word( '自定义词' )

test_sent = ( 
"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
"例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
"「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
 )
words = jieba.cut( test_sent )
print( '/'.join( words ) )

print( "="*40 )

result = pseg.cut( test_sent )

for w in result:
    print( w.word, "/", w.flag, ", ", end = ' ' )
Example #37
# -*- coding:utf-8 -*-
# User: rudy
# Time: 2015/11/01

import MySQLdb
import jieba
import pandas as pd
from pandas import Series,DataFrame

conn=MySQLdb.connect(host="115.28.149.242",user="******",passwd="***",db="test",charset="utf8")
cursor = conn.cursor()

jieba.load_userdict('./foobar.txt')
jieba.del_word('web')
sql = 'SELECT jd FROM lagou_source'
cursor.execute(sql)
result = cursor.fetchall()

ci_arr = {}
for item in result:
    temp_arr = jieba.lcut(item[0])
    for key in temp_arr:
        if len(key) >= 3:
            temp = key.lower()
            if temp in ci_arr:
                ci_arr[temp] += 1
            else:
                ci_arr[temp] = 1


is_first = True

test_sent = (
"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n",
"例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n",
"「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
)

print test_sent[0].decode('utf-8')
words = jieba.cut(test_sent[0].decode('utf-8'))
print('/'.join(words))

print("="*40)

########### del word from dictionary
jieba.del_word('云计算')
jieba.add_word('浦东区')
words = jieba.cut(test_sent[0].decode('utf-8'))
print('/'.join(words))

"""
result = pseg.cut(test_sent)

for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')

print("\n" + "="*40)

terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')