Example #1
def test_json():
    pt.initialize()
    res = [pt.lcut(s) for s in sentences]
    j = json.dumps(res, cls=PairJSONEncoder)
    print(j)
    k = json.loads(j, cls=PairJSONDecoder)
    print(k)
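
# Note: `pt`, `sentences`, `PairJSONEncoder` and `PairJSONDecoder` come from the
# rest of this test module, which is not shown above; `pt` behaves like
# jieba.posseg. Since posseg's `pair` objects are not JSON-serializable by
# default, an encoder along these lines would be needed (a minimal sketch, not
# the original class):
import json
from jieba.posseg import pair


class PairJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        # Serialize each jieba pair as a plain [word, flag] list.
        if isinstance(obj, pair):
            return [obj.word, obj.flag]
        return super().default(obj)


# A matching decoder would map those [word, flag] lists back to pair objects.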
Example #2
def __init__(self, model='jieba'):
    self.model = model
    if model.lower() == 'jieba':
        import jieba.posseg as posseg
        posseg.initialize()
        self.segmentor = posseg.POSTokenizer(tokenizer=None)
    elif model.lower() == 'ictclas':
        import pynlpir
        pynlpir.open()
        self.segmentor = pynlpir
    else:
        raise NotImplementedError
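
# A minimal sketch of how the two backends above could be called through one
# helper (the function name `pos_tag` is an assumption, not part of the
# original class):
import jieba.posseg as posseg


def pos_tag(text, model='jieba'):
    if model.lower() == 'jieba':
        posseg.initialize()
        tk = posseg.POSTokenizer(tokenizer=None)
        # POSTokenizer.cut yields pair objects carrying .word and .flag
        return [(p.word, p.flag) for p in tk.cut(text)]
    elif model.lower() == 'ictclas':
        import pynlpir
        pynlpir.open()
        # pynlpir.segment returns (word, pos) tuples when pos_tagging=True
        return pynlpir.segment(text, pos_tagging=True)
    raise NotImplementedError(model)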
Example #3
# jieba.load_userdict('/home/gyzhang/projects/cFrontEnd/data/dicts/dict_name.dict')
# from jieba import posseg
import logging
import time
# import pdb
import json, uuid, http.client, urllib.parse
import pycantonese as pc
from utils import *
import jyutping
from collections import OrderedDict
from linguistic_dict import Linguistic_DICT
from jieba import posseg
import tensorflow as tf
from aip import AipNlp
from pypinyin import pinyin, Style, style
# from hanziconv import HanziConv
posseg.initialize(dictionary='../data/dicts/simple_dict.txt')
ld = Linguistic_DICT()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration parameters for front end


class FrontEnd(object):
    def __init__(self, ):
        # project and dictionary file paths
        self.project_path = '/home/gyzhang/projects/cFrontEnd'
        self.wav_folder = '/home/gyzhang/speech_database/cuprosody/Wave'
        self.name = "cuprosody"
Example #4
# @Time    : 18-9-28 1:47 PM
# @Author  : duyongan
# @FileName: text_utils.py
# @Software: PyCharm
import re
from simple_pickle import utils
from text_process.text import Text
import nltk
import os
import numpy as np
from jieba import posseg

# from cppjieba_py import posseg
# from numba import jit

posseg.initialize()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
here = os.path.dirname(__file__)
stopwords = utils.read_pickle(here + '/stopwords')
idf_map = utils.read_pickle(here + '/idf_map')


def text2sencents_zh(text):
    text = re.sub('\u3000|\r|\t|\xa0', '', text)
    text = re.sub('?”|!”|。”', '”', text)
    sentences = re.split("([。!?……])", text)
    sentences.append('')
    sentences = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])]
    last_sentences = []
    for sentence in sentences:
        last_sentences += [
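
# The core of the splitter above is re.split with a capturing group followed by
# re-joining even/odd slices; the tail of text2sencents_zh is truncated here, so
# this is a self-contained sketch of just that idiom (assuming the missing loop
# only filters out empty strings):
import re


def split_zh_sentences(text):
    # Split on Chinese sentence-final punctuation while keeping the delimiter.
    parts = re.split("([。!?……])", text)
    parts.append('')
    # Re-attach each delimiter to the sentence it terminates.
    sents = ["".join(p) for p in zip(parts[0::2], parts[1::2])]
    return [s for s in sents if s]


# split_zh_sentences("今天天气不错。我们去公园玩吧!好不好?")
# -> ['今天天气不错。', '我们去公园玩吧!', '好不好?']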
Example #5
# combine similarity scores
_similarity_smooth = lambda x, y, z, u: (x * y) + z - u
_flat_sum_array = lambda x: np.sum(x, axis=0)  # numerator
'''
tokenizer settings
'''
tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
        tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
        print("info: set wordseg dict with %s" % tokenizer_dict)
    else:
        print("warning: can not find dict at [%s]" % ENVIRON["SYNONYMS_WORDSEG_DICT"])

print(">> Synonyms load wordseg dict [%s] ... " % tokenizer_dict)
_tokenizer.initialize(tokenizer_dict)
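
# The block above lets an environment variable override the bundled
# word-segmentation dictionary; the override has to be in place before the
# library is imported. A usage sketch (the dictionary path is hypothetical):
import os
os.environ["SYNONYMS_WORDSEG_DICT"] = "/path/to/custom_vocab.txt"  # set first
# import synonyms  # the import then picks up the override shown above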

# stopwords
_fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')


def _load_stopwords(file_path):
    '''
    load stop words
    '''
    global _stopwords
    if sys.version_info[0] < 3:
        words = open(file_path, 'r')
    else:
        words = open(file_path, 'r', encoding='utf-8')
    stopwords = words.readlines()
Example #6
"""

import jieba
from jieba.kvdict import Kvdict
import sys


if __name__ == "__main__":

    freq_dict = Kvdict("word_freq.db")
    tag_dict = Kvdict("word_tag.db")
    jieba.dt.initialize()
    freq_dict.convert_value = lambda x: x if x is None else int(x)
    jieba.dt.FREQ = freq_dict
    from jieba import posseg as pg
    pg.initialize()
    pg.dt.word_tag_tab = tag_dict
    # import pdb
    # pdb.set_trace()
    pg.dt.add_word("上海电力股份有限公司",1000000,"n")
    for line in sys.stdin:
        line = line.strip()
        print line,jieba.dt.FREQ[line],
        for x, y in pg.cut(line.strip()):
            print (u"(%s,%s)" % (x, y)).encode("utf8"),
        print
    freq_dict.close()
    tag_dict.close()
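
# Kvdict above is a custom on-disk key-value store, not part of stock jieba.
# Anything assigned to jieba.dt.FREQ only needs to behave like a read-only
# mapping (`in`, [], .get), since that is how jieba looks words up while
# cutting. A minimal sketch of such an adapter backed by the standard library's
# shelve (the class name and DB path are assumptions):
import shelve


class DiskFreq(object):
    """Read-only mapping facade over a shelve database."""

    def __init__(self, path):
        self._db = shelve.open(path, flag='r')

    def __contains__(self, key):
        return key in self._db

    def __getitem__(self, key):
        return self._db[key]

    def get(self, key, default=None):
        return self._db.get(key, default)

    def close(self):
        self._db.close()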


Example #7
import jieba.posseg as pseg
pseg.initialize()


def multilingual_sent_split(texts):
    print('\nOriginal texts: ', texts)
    lingual_split_sign = {'x', 'eng'}
    final_parts = []
    sub_part = []
    cuts = pseg.lcut(texts)
    for idx in range(len(cuts) - 1):
        # If the current word and the next word fall on the same side of lingual_split_sign, keep extending the current sub_part
        if (cuts[idx].flag in lingual_split_sign
                and cuts[idx + 1].flag in lingual_split_sign) or (
                    cuts[idx].flag not in lingual_split_sign
                    and cuts[idx + 1].flag not in lingual_split_sign):
            sub_part.append(cuts[idx].word)
        # Otherwise close the current sub_part into final_parts and start a new one
        else:
            sub_part.append(cuts[idx].word)
            final_parts.append(sub_part)
            sub_part = []
    # If the last word is on the same side as the second-to-last word, append it to the current sub_part
    if (cuts[-1].flag in lingual_split_sign
            and cuts[-2].flag in lingual_split_sign) or (
                cuts[-1].flag not in lingual_split_sign
                and cuts[-2].flag not in lingual_split_sign):
        sub_part.append(cuts[-1].word)
    # Otherwise append the last word to final_parts as its own sub_part
    else:
        final_parts.append([cuts[-1].word])
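
# The same grouping of consecutive words by "is this a Chinese token or not"
# can be written with itertools.groupby; a compact sketch (the function name is
# an assumption, not the original):
import itertools
import jieba.posseg as pseg


def split_by_script(text, signs=frozenset({'x', 'eng'})):
    cuts = pseg.lcut(text)
    # Group consecutive pairs whose flags fall on the same side of `signs`.
    return [[p.word for p in grp]
            for _, grp in itertools.groupby(cuts, key=lambda p: p.flag in signs)]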
Example #8
import logging
import os
import re
import jieba
# jieba.initialize()  # (optional)
# jieba.load_userdict('/home/gyzhang/projects/cFrontEnd/data/dicts/dict_name.dict')
# from jieba import posseg
import time
# import pdb
import json, uuid, http.client, urllib.parse
import pycantonese as pc
import jyutping
from collections import OrderedDict
from linguistic_dict import Linguistic_DICT
from jieba import posseg
import tensorflow as tf
posseg.initialize(dictionary='../data/dicts/dict_name.dict')
ld = Linguistic_DICT()
logging.basicConfig(level=logging.INFO)

# Configuration parameters for front end

class CFrontEnd(object):
    def __init__(self, ):
        # project and dictionary file paths
        self.project_path = '/home/gyzhang/projects/cFrontEnd'
        self.wav_folder = '/home/gyzhang/speech_database/cuprosody/Wave'
        self.name = "cuprosody"
        self.text_file = os.path.join(self.project_path, "exp", self.name, 'train/cn_text.txt')
        # test file from outside this domain
        self.test_text_file = os.path.join(self.project_path, "exp", self.name, "train/cn_text_test.txt")
        self.mld = Linguistic_DICT()
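
# posseg.initialize(dictionary=...) above replaces jieba's main dictionary with
# a custom one. jieba expects one "word frequency POS-tag" entry per line, and
# posseg needs all three fields to build its word/tag table. A tiny illustration
# with a throwaway dictionary (the entries are made up):
import os
import tempfile
from jieba import posseg

entries = "粤语 100 n\n茶餐厅 50 n\n唔该 80 e\n"
fd, dict_path = tempfile.mkstemp(suffix='.dict')
with os.fdopen(fd, 'w', encoding='utf-8') as f:
    f.write(entries)
posseg.initialize(dictionary=dict_path)
print(posseg.lcut("唔该"))  # e.g. [pair('唔该', 'e')]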
Example #9

def test_textrank():
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后," + \
        "吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产" + \
        "开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业" + \
        "收入0万元,实现净利润-139.13万元。"
    import jieba.analyse
    res = jieba.analyse.textrank(s, topK=20, withWeight=True, withFlag=True)
    #print(res)
    a = ['公司', '全资', '子公司', '吉林', '欧亚', '置业', '有限公司', '增资', '注册资本', '增加', '经营范围', '开发', '百货', '零售', '业务', '在建', '城市', '商业', '综合体', '项目', '实现', '营业', '收入', '净利润']
    b = ["实现", "零售", "注册资本", "营业", "置业", "城市", "业务", "欧亚", "开发", "百货", "增资", "收入", "子公司", "吉林", "项目", "全资", "商业", "经营范围", "综合体", "在建", "公司", "净利润", "有限公司"]
    print(set(a)-set(b))
    print(set(b)-set(a))
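
# With withWeight=True and withFlag=True, each item in `res` above is a
# (pair, weight) tuple, where pair carries .word and .flag. A small sketch of
# unpacking the result (the helper name is an assumption):
import jieba.analyse


def top_keywords(text, k=20):
    res = jieba.analyse.textrank(text, topK=k, withWeight=True, withFlag=True)
    return [(kw.word, kw.flag, weight) for kw, weight in res]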


if __name__ == '__main__':
    # `pt` (a posseg-style tokenizer) and `sentences` are defined earlier in
    # this test module; that part of the file is not included in this excerpt.
    pt.initialize()

    res = pt.lcut(sentences[10])
    res = pt.lcut(sentences[10], HMM=False)
    #print(res)

    #task_test_sentences_cut()
    #task_test_sentences_cut_noHMM()

    #task_test_book_cut()
    #task_test_book_cut_noHMM()
    test_textrank()