def load_all_model():
    """返回分词,词性标注,命名实体识别,依存解析等实例对象"""
    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # ltp模型目录的路径
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
    segmentor = Segmentor()  # 初始化实例
    segmentor.load_with_lexicon(cws_model_path, './temp_file/cut_external_dict/cut_external_dict')  # 加载模型

    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # ltp模型目录的路径
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
    postagger = Postagger()  # 初始化实例
    postagger.load_with_lexicon(pos_model_path, './temp_file/pos_external_dict/pos_external_dict')  # 加载模型

    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # ltp模型目录的路径
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # 命名实体识别模型路径,模型名称为`pos.model`
    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型

    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # ltp模型目录的路径
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
    parser = Parser()  # 初始化实例
    parser.load(par_model_path)  # 加载模型

    fname = r"E:/MYGIT/model/wiki_stopwords/wiki_word2vec.kv"
    # model_wv.save(fname)
    model_wv = KeyedVectors.load(fname, mmap='r')
    return [segmentor, postagger, recognizer, parser, model_wv]
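A minimal usage sketch for the objects returned above (not part of the original snippet), assuming the model and lexicon paths exist; the sample sentence is only illustrative:

segmentor, postagger, recognizer, parser, model_wv = load_all_model()
words = list(segmentor.segment('我爱北京天安门'))        # word segmentation
postags = list(postagger.postag(words))                  # POS tagging
netags = list(recognizer.recognize(words, postags))      # named entity recognition
arcs = parser.parse(words, postags)                      # dependency parsing
print(words, postags, netags)
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
# release the underlying C++ models when done
for m in (segmentor, postagger, recognizer, parser):
    m.release()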
Example #2
class Opinion(object):
    def __init__(self, Dsent, industry_id):
        self.industry_id = industry_id
        self.Dsent = Dsent
        self.postagger = Postagger()  # 初始化实例
        self.postagger.load_with_lexicon(pos_model_path,
                                         '%s/conf/posttags.txt' % dir_path)
        self.sql = mysqls()
        self.opinionword = read_opinion(self.industry_id)
        self.n_v = []

    def cut_word(self, sents):
        # 分词
        words = [i.encode('utf-8', 'ignore')
                 for i in norm_cut(sents)]  # HMM=False
        return words

    def word_sex(self, ):
        # get part-of-speech tags
        postags = list(self.postagger.postag(self.words))  # POS tagging
        num = 0
        # the word right after an adverb, if it is a noun or verb
        for tag in postags:
            if tag in ['d']:
                if num + 1 < len(postags):
                    if num != 0 and postags[num + 1] in ['n', 'v']:
                        if self.words[num+1] not in self.opinionword \
                            and len(self.words[num + 1].decode('utf-8','ignore')) > 1:
                            self.n_v.append(self.words[num + 1])
            # adjectives, idioms, and other modifier words
            if tag in ['a', 'i', 'b']:
                if self.words[num] not in self.opinionword\
                        and len(self.words[num].decode('utf-8','ignore')) > 1:
                    self.n_v.append(self.words[num])
            num += 1
        return postags

    def prepare(self, ):
        for id, sentences in self.Dsent.items():
            split_sentence = re.split(
                ur'[,,()()、: …~?。!. !?]?',
                sentences.decode('utf-8', 'ignore').strip())
            for sent in split_sentence:
                self.words = self.cut_word(sent.encode('utf-8', 'ignore'))
                self.postags = self.word_sex()
                cword = Counter(self.n_v)

                lresult = heapq.nlargest(500,
                                         cword.items(),
                                         key=lambda x: x[1])
                # lword = []
                # for rg in lresult:
                #     w, n = rg
                #     lword.append(w)
                # self.sql.insert(self.industry_id, lword)
        self.postagger.release()  # release the model
        # self.parser.release()  # release the model
        # outfile.close()
        return lresult
Example #3
def new_relation_find(words, sentence):
    """ 新关系发现

    :param words:
    :param sentence:
    :return:
    """
    # dict that stores the entities of the triple, keyed by their offsets in the sentence
    tuple_dict = dict()
    index0 = -1
    index1 = -1
    found = False
    for entity_word in entity_words:
        if sentence.find(entity_word) != -1:
            if tuple_dict:
                # has_same returning True means the two entities overlap
                if has_same(tuple_dict[index0], entity_word):
                    continue
                index1 = sentence.find(entity_word)
                tuple_dict[index1] = entity_word
                found = True
                break
            else:
                index0 = sentence.find(entity_word)
                tuple_dict[index0] = entity_word
    if found is False:
        return "", "", ""
    # the sorted result would be a list
    # tuple_dict = sorted(tuple_dict.items(), key=lambda d: d[0])
    words = "/".join(words).split("/")
    for key, value in tuple_dict.items():
        tuple_word = value
        words = init_words(tuple_word, words)
    # POS-tag the rebuilt word list
    postagger = Postagger()  # initialize the instance
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS tagging model path, file name `pos.model`
    postagger.load_with_lexicon(pos_model_path, 'data/postagger.txt')  # load the model
    postags = postagger.postag(words)  # POS tagging
    print('\t'.join(postags))
    postagger.release()  # release the model
    # look for the new relation word
    relation_word = ""
    index_word = 0
    for index, postag in enumerate('\t'.join(postags).split('\t')):
        index_word += len(words[index])
        if index_word >= len(sentence):
            break
        if postag == 'v' and index_word - min(index0, index1) <= 2 and max(index0, index1) - index_word <= 2 \
                and not has_same(tuple_dict[index0], words[index]) and not has_same(tuple_dict[index1],
                                                                                    words[index]) \
                and words[index] not in wrong_relation:
            relation_word = words[index]
            break
    if relation_word == "":
        return "", "", ""
    return tuple_dict[min(index0,
                          index1)], tuple_dict[max(index0,
                                                   index1)], relation_word
Example #4
def postaggers(words):
    postagger = Postagger()  # initialize the instance
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS tagging model path, file name `pos.model`
    postagger.load_with_lexicon(pos_model_path, 'data/postagger.txt')  # load the model
    postags = postagger.postag(words)  # POS tagging
    # print('\t'.join(postags))
    postagger.release()  # release the model
    return postags
Example #5
class Model:
    # model wrapper class
    def __init__(self):
        self.segmentor = None
        self.postagger = None
        self.recognizer = None
        self.parser = None
        self.model_wv = None

    def load_model(self):
        """返回分词,词性标注,命名实体识别,依存解析等实例对象"""
        LTP_DATA_DIR = Myconfig.get_path('ltp_data')
        LTP_TEMP_DIR = Myconfig.get_path('temp_file')
        assert LTP_DATA_DIR
        assert LTP_TEMP_DIR
        cws_model_path = os.path.join(LTP_DATA_DIR,
                                      'cws.model')  # 分词模型路径,模型名称为`cws.model`
        cut_temp_path = os.path.join(LTP_TEMP_DIR,
                                     'cut_external_dict/cut_external_dict')
        self.segmentor = Segmentor()  # 初始化实例
        self.segmentor.load_with_lexicon(cws_model_path, cut_temp_path)  # 加载模型

        pos_model_path = os.path.join(LTP_DATA_DIR,
                                      'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        pos_temp_path = os.path.join(LTP_TEMP_DIR,
                                     'pos_external_dict/pos_external_dict')
        self.postagger = Postagger()  # 初始化实例
        self.postagger.load_with_lexicon(pos_model_path, pos_temp_path)  # 加载模型

        ner_model_path = os.path.join(
            LTP_DATA_DIR, 'ner.model')  # 命名实体识别模型路径,模型名称为`pos.model`
        self.recognizer = NamedEntityRecognizer()  # 初始化实例
        self.recognizer.load(ner_model_path)  # 加载模型

        par_model_path = os.path.join(
            LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
        self.parser = Parser()  # 初始化实例
        self.parser.load(par_model_path)  # 加载模型

        fname = Myconfig.get_path('vec.kv')  # 或取模型目录
        assert fname
        # model_wv.save(fname)
        self.model_wv = KeyedVectors.load(fname, mmap='r')

    def release_all_model(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        del (self.model_wv)
        _ = gc.collect()
        _ = gc.collect()
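A short lifecycle sketch for the Model wrapper above, assuming the Myconfig paths resolve to valid model files; the sentence is only illustrative:

model = Model()
model.load_model()
words = list(model.segmentor.segment('他毕业于北京大学'))
postags = list(model.postagger.postag(words))
netags = list(model.recognizer.recognize(words, postags))
arcs = model.parser.parse(words, postags)
print(list(zip(words, postags, netags)))
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
model.release_all_model()  # free the C++ models and drop the word vectors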
Example #6
 def __init__(self, lexicon_path='./data/lexicon'):
     postagger = Postagger()
     postagger.load_with_lexicon(pos_model_path, lexicon_path)
     parser = Parser()
     parser.load(par_model_path)
     segmentor = Segmentor()
     segmentor.load_with_lexicon(cws_model_path, lexicon_path)
     recognizer = NamedEntityRecognizer()
     recognizer.load(ner_model_path)
     self.postagger = postagger
     self.parser = parser
     self.segmentor = segmentor
     self.recognizer = recognizer
Example #7
def SrlFunction(contents):
    from pyltp import Segmentor
    segmentor = Segmentor()  # 初始化实例
    # segmentor.load(cws_model_path)  # 加载模型
    segmentor.load_with_lexicon(cws_model_path,
                                'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # 分词
    k = 1
    for word in words:
        print(word + str(k) + '  ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    segmentor.release()  # 释放模型
    wordslist = list(words)

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    postagger.load_with_lexicon(pos_model_path,
                                'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = postagger.postag(wordslist)
    print('\t'.join(postags))
    postagger.release()

    # wordslist = ['人力资源社会保障局','主管','医疗保险','工作']
    # postags = ['n','v','n','v']

    from pyltp import Parser
    parser = Parser()  # 初始化实例
    parser.load(par_model_path)  # 加载模型
    arcs = parser.parse(wordslist, postags)  # 句法分析
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # 释放模型

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # 初始化实例
    labeller.load(srl_model_path)  # 加载模型
    # arcs comes from the dependency parsing result
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labeling

    # print the results
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # 释放模型
Example #8
 def __init__(self, lexicon_path='./data/lexicon'):
     postagger = Postagger()
     postagger.load_with_lexicon(pos_model_path, lexicon_path)
     parser = Parser()
     parser.load(par_model_path)
     # segmentor = Segmentor()
     # segmentor.load_with_lexicon(cws_model_path, lexicon_path)
     recognizer = NamedEntityRecognizer()
     recognizer.load(ner_model_path)
     self.postagger = postagger
     self.parser = parser
     # self.segmentor = segmentor
     self.recognizer = recognizer
     jieba.load_userdict(lexicon_path)
     jieba.enable_parallel(12)
Example #9
class Ltp:
    "https://pyltp.readthedocs.io/zh_CN/latest/"
    def __init__(self, seg=True, pos=False, ner=False, parse=False,
                 seg_lexicon_path=None, pos_lexicon_path=None):
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation model, file name `cws.model`
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model path
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model

        if seg:
            self.segmentor = Segmentor()               # word segmentation
            if seg_lexicon_path:
                self.segmentor.load_with_lexicon(cws_model_path,
                                                 seg_lexicon_path)
            else:
                self.segmentor.load(cws_model_path)

        if pos:
            # takes the segmentation result as input
            self.postagger = Postagger()               # POS tagging
            if pos_lexicon_path:
                self.postagger.load_with_lexicon(pos_model_path,
                                                 pos_lexicon_path)
            else:
                self.postagger.load(pos_model_path)

        if ner:
            # takes the segmentation and POS tagging results as input
            self.ner = NamedEntityRecognizer()  # named entity recognition
            self.ner.load(ner_model_path)

        if parse:
            # takes the segmentation and POS tagging results as input
            self.parser = Parser()                     # dependency parsing
            self.parser.load(par_model_path)

    def release(self):
        # release whichever models were actually loaded
        for attr in ('segmentor', 'postagger', 'ner', 'parser'):
            model = getattr(self, attr, None)
            if model is not None:
                model.release()

    def __del__(self):
        self.release()
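A minimal usage sketch for the Ltp wrapper above, assuming LTP_DATA_DIR is defined at module level and points at an ltp_data_v3.4.0 directory; the sentence is illustrative:

ltp = Ltp(seg=True, pos=True, ner=True, parse=True)
words = list(ltp.segmentor.segment('中国进出口银行与中国银行加强合作'))
postags = list(ltp.postagger.postag(words))
netags = list(ltp.ner.recognize(words, postags))
arcs = ltp.parser.parse(words, postags)
print(list(zip(words, postags, netags)))
print('\t'.join('%d:%s' % (a.head, a.relation) for a in arcs))
ltp.release()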
Example #10
def posttagger(words):
    postagger = Postagger()  # 初始化实例
    #postagger.load('E:\\python2.7 install\\pyltp-master\\ltp_data\\pos.model')  # 加载模型
    postagger.load_with_lexicon(pos_model_path, 'D:\\LTP\\ltp_data\\pos.txt')

    #postagger = settings.POSTAGGER
    #if settings.POSTAGGER is None:
    #    settings.POSTAGGER = Postagger()
    #    settings.POSTAGGER.load_with_lexicon('/mnt/hgfs/ubuntu-share/pyltp-master/ltp_data/pos.model','/mnt/hgfs/ubuntu-share/pyltp-master/ltp_data/pos.txt')

    #postagger  = settings.POSTAGGER

    postags = postagger.postag(words)  # 词性标注
    print "词性标注:\n"
    for word, tag in zip(words, postags):
        print word + '/' + tag

    postagger.release()  # 释放模型
    return postags
Example #11
def work():
    segmentor = Segmentor()  # 初始化实例(分词,词性,Ner)
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    cws_model_path = "D:\\Academic\\LTP\\3.4.0\ltp_data_v3.4.0\cws.model"
    pos_model_path = "D:\\Academic\\LTP\\3.4.0\ltp_data_v3.4.0\pos.model"
    segmentor.load_with_lexicon(cws_model_path, "\\dictionary.txt")  # 加载模型和词典
    postagger.load_with_lexicon(pos_model_path, "\\dictionary.txt")
    recognizer.load('D:\\Academic\\LTP\\3.4.0\\ltp_data_v3.4.0\\ner.model')

    stopwords = stopwordslist('stoplist.txt')

    for line in Input.readlines():
        words = segmentor.segment(line)  # 分词
        words_list = list(words)
        word_list = []
        for word in words_list:
            if word not in stopwords :
                print(''.join(word) + ' ', end='')
                word_list.append(word)
        print()

        postags = postagger.postag(word_list)  # 词性分析
        postags_list = list(postags)
        for word, tag in zip(word_list, postags_list):
          print(word + ' /' + tag)
        print()

        netags = recognizer.recognize(word_list, postags)  # 命名实体识别
        for word, tag in zip(word_list, netags):
            if tag != 'O':
                print(word + '/' + tag)
        print()

    postagger.release()  # 释放模型
    segmentor.release()
    recognizer.release()
Example #12
sents = SentenceSplitter.split(text)  # 分句

segmentor = Segmentor()  # 初始化实例
#segmentor.load(cws_model_path)  # 加载模型
segmentor.load_with_lexicon(cws_model_path,
                            'D:\\python\\ltp_data_v3.4.0\\lexicon')
segmentor_2 = Segmentor()  # initialize the instance
# # segmentor.load(cws_model_path)  # load the model
segmentor_2.load_with_lexicon(
    cws_model_path, 'D:\\python\\ltp_data_v3.4.0\\lexicon_label'
)  # load the model; the second argument is the path of the external lexicon file

postagger = Postagger()  # 初始化实例
postagger_2 = Postagger()  # 初始化实例
postagger.load_with_lexicon(pos_model_path,
                            'D:\\python\\ltp_data_v3.4.0\\lexicon_1')  # load the model
postagger_2.load_with_lexicon(
    pos_model_path, 'D:\\python\\ltp_data_v3.4.0\\lexicon_label_1')  # load the model
recognizer = NamedEntityRecognizer()  # 初始化实例
recognizer_2 = NamedEntityRecognizer()  # 初始化实例
recognizer.load(ner_model_path)  # 加载模型
recognizer_2.load(ner_model_path)  # 加载模型
parser = Parser()  # 初始化实例
parser.load(par_model_path)  # 加载模型
labeller = SementicRoleLabeller()  # 初始化实例
labeller.load(srl_model_path)  # 加载模型


def is_name_entity(entity):
    return entity != 'O'
Example #13
class NlpLtp():
    def __init__(self):
        print('Load pyltp models...')
        start = time.time()
        self.segmentor = Segmentor()  # initialize the instance
        self.segmentor.load_with_lexicon(cws_model_path, user_dict_seg)  # load the model
        self.postagger = Postagger()
        self.postagger.load_with_lexicon(pos_model_path, user_dict_pos)

        #self.parser = Parser()  # initialize the instance
        #self.parser.load(par_model_path)  # load the model
        #self.labeller = SementicRoleLabeller()  # initialize the instance
        #self.labeller.load(srl_model_path)  # load the model
        self.recognizer = NamedEntityRecognizer()  # initialize the instance
        self.recognizer.load(ner_model_path)
        self.nerdict = dict()
        elapsed = time.time() - start
        print('Load pyltp models finished in ', elapsed)

    # release the models
    def __del__(self):
        print('Release pyltp models...')
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        #self.parser.release()
        #self.labeller.release()
        print('Release pyltp models finished.')

    def sentence(self, content):
        return SentenceSplitter.split(content)

    def segment(self, text):
        return self.segmentor.segment(text)

    def postag(self, wordlist):
        return self.postagger.postag(wordlist)

    #def parse(self, wordlist, postags):
    #    return self.parser.parse(wordlist, postags)

    #def role_label(self, wordlist, postags, arcs):
    #    return self.labeller.label(wordlist, postags, arcs)
    def get_keywords(self, txt):
        words = self.segment(txt)
        postags = self.postag(words)
        ners = self.ner(words, postags)
        keywords = list()
        for k, val in ners.items():
            keywords.append(k)
        return keywords

    def add_entity(self, word, tag):
        if word in self.nerdict:
            count = self.nerdict[word][1]
        else:
            count = 0
        self.nerdict[word] = [tag, count + 1]

    # NER results: LTP entity types are person names (Nh), place names (Ns), and organization names (Ni);
    # LTP uses the BIESO tagging scheme:
    # B = first word of an entity, I = middle word, E = last word, S = single-word entity, O = not part of an entity.
    def ner(self, wordlist, postags):
        ners = self.recognizer.recognize(wordlist, postags)
        for i in range(0, len(ners)):
            #print( wordlist[i], postags[i], ners[i] )
            if postags[i] in NOUN_LIST:
                word = wordlist[i].strip()
                if len(word) > 1:
                    self.add_entity(word, postags[i])

            if ners[i] == 'S-Ns' or ners[i] == 'S-Nh' or ners[i] == 'S-Ni':
                word = wordlist[i].strip()
                if len(word) > 1 or word in PROVINCE_NAME:  # keep entity names longer than one character (or province names), storing the POS tag
                    self.add_entity(word, postags[i])
        return self.nerdict

    def clean_ner(self):
        self.nerdict = dict()

    def get_ner(self):
        return self.nerdict
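A short usage sketch for the NlpLtp wrapper above, assuming the module-level model paths, user dictionaries, NOUN_LIST and PROVINCE_NAME are defined as in the original project:

nlp = NlpLtp()
for sent in nlp.sentence('李克强总理今天来到了北京。他参观了清华大学。'):
    words = list(nlp.segment(sent))
    postags = list(nlp.postag(words))
    nlp.ner(words, postags)        # accumulates entities into nlp.nerdict
print(nlp.get_ner())               # {entity: [postag, count], ...}
nlp.clean_ner()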
Example #14
from similarity import *

sub_pattern, con_pattern, obj_pattern = 0, 0, 0

cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
parser_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
pisrl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')

role_model = svm_load_model("../data/models/role.model")
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path, '../data/configure/lexicon.txt')

postagger = Postagger()
postagger.load_with_lexicon(pos_model_path, '../data/configure/pos.txt')

parser = Parser()
parser.load(parser_model_path)

recognizer = NamedEntityRecognizer()
recognizer.load(ner_model_path)

labeller = SementicRoleLabeller()
labeller.load(pisrl_model_path)


class Record():
    def __init__(self):

        self.original_sentence = ''
Example #15
class LtpParser():
    def __init__(self):
        LTP_DIR = "/home/ubuntu/model/ltp/ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, "word_dict.txt"))  #加载外部词典

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(
            os.path.join(LTP_DIR, "pos.model"),
            os.path.join(LTP_DIR, "n_word_dict.txt"))  #加载外部词典

        # self.parser = Parser()
        # self.parser.load(os.path.join(LTP_DIR, "parser.model")) #依存句法分析

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))  #实体识别

        # #加载停词
        # with open(LTP_DIR + '/stopwords.txt', 'r', encoding='utf8') as fread:
        #     self.stopwords = set()
        #     for line in fread:
        #         self.stopwords.add(line.strip())

    '''Align each recognized entity with its POS tag'''

    def wordspostags(self, name_entity_dist, words, postags):
        pre = ' '.join(
            [item[0] + '/' + item[1] for item in zip(words, postags)])
        post = pre
        for et, infos in name_entity_dist.items():
            if infos:
                for info in infos:
                    post = post.replace(' '.join(info['consist']),
                                        info['name'])
        post = [
            word for word in post.split(' ')
            if len(word.split('/')) == 2 and word.split('/')[0]
        ]
        words = [tmp.split('/')[0] for tmp in post]
        postags = [tmp.split('/')[1] for tmp in post]

        return words, postags

    '''Arrange the NER results into lists of entities'''

    def entity(self, words, netags, postags):
        '''
        :param words: word list
        :param netags: NER tag list
        :param postags: POS tag list
        :return:
        '''
        name_entity_dict = {}
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        index = 0
        for item in zip(words, netags):
            word = item[0]
            ntag = item[1]
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word + '_%s ' % index)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word + '_%s ' % index)
                    else:
                        place_entity_list.append(word + '_%s ' % index)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""
            index += 1
        name_entity_dict['nhs'] = self.modify(name_entity_list, words, postags,
                                              'nh')
        name_entity_dict['nis'] = self.modify(organization_entity_list, words,
                                              postags, 'ni')
        name_entity_dict['nss'] = self.modify(place_entity_list, words,
                                              postags, 'ns')
        return name_entity_dict

    def modify(self, entity_list, words, postags, tag):
        modify = []
        if entity_list:
            for entity in entity_list:
                entity_dict = {}
                subs = entity.split(' ')[:-1]
                start_index = subs[0].split('_')[1]
                end_index = subs[-1].split('_')[1]
                entity_dict['stat_index'] = start_index
                entity_dict['end_index'] = end_index
                if start_index == entity_dict['end_index']:
                    consist = [
                        words[int(start_index)] + '/' +
                        postags[int(start_index)]
                    ]
                else:
                    consist = [
                        words[index] + '/' + postags[index]
                        for index in range(int(start_index),
                                           int(end_index) + 1)
                    ]
                entity_dict['consist'] = consist
                entity_dict['name'] = ''.join(
                    tmp.split('_')[0] for tmp in subs) + '/' + tag
                modify.append(entity_dict)
        return modify

    '''POS tags and named entities'''

    def post_ner(self, words):
        postags = list(self.postagger.postag(words))
        # words_filter =[]
        # postags = []
        # for word, postag in zip(words, self.postagger.postag(words)):
        #     if 'n' in postag:
        #         postags.append(postag)
        #         words_filter.append(word)
        nerags = self.recognizer.recognize(words, postags)
        return postags, nerags

    def parser_process(self, sentence):
        words = list(self.segmentor.segment(sentence))
        post, ner = self.post_ner(words)  # 词性和实体
        name_entity_dist = self.entity(words, ner, post)
        words, postags = self.wordspostags(name_entity_dist, words, post)
        return words, postags
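A brief usage sketch for this LtpParser, assuming the hard-coded LTP_DIR and lexicon files exist; parser_process merges multi-word entities into single tokens before returning words and POS tags:

ltp = LtpParser()
words, postags = ltp.parser_process('中国工商银行与腾讯科技有限公司签署了合作协议')
for w, p in zip(words, postags):
    print(w, p)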
Example #16
from pyltp import Parser
from pyltp import Segmentor
from pyltp import Postagger
import networkx as nx
import pylab
import re
import matplotlib.pyplot as plt
from pylab import mpl
from graphviz import Digraph
import numpy as np

# initialize the instances
postagger = Postagger()

pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
postagger.load_with_lexicon(pos_model_path, '../data/user_dict.txt')  # load the model
segmentor = Segmentor()  # initialize the instance
segmentor.load_with_lexicon(cws_model_path, '../data/user_dict.txt')  # load the model

SEN_TAGS = [
    "SBV", "VOB", "IOB", "FOB", "DBL", "ATT", "ADV", "CMP", "COO", "POB",
    "LAD", "RAD", "IS", "HED"
]


def parse(s, isGraph=False):
    """
    Run syntactic (dependency) parsing on the sentence and return the result.
    """
    tmp_ner_dict = {}
    num_lst = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
Example #17
class pyltp_worker(object):

    #初始化,创建实例,加载基础模型
    def __init__(self, model_path):
        self.LTP_MODEL_DIR = model_path
        self.segmentor = Segmentor()  #分词
        self.postagger = Postagger()  #词性标注
        self.recognizer = NamedEntityRecognizer()  #命名实体识别
        self.parser = Parser()  #依存句法分析
        self.load_model()

    #加载基础模型
    def load_model(self):
        self.cws_model_path = os.path.join(self.LTP_MODEL_DIR,
                                           'cws.model')  #分词模型路径
        self.pos_model_path = os.path.join(self.LTP_MODEL_DIR,
                                           'pos.model')  #词性标注模型路径
        self.ner_model_path = os.path.join(self.LTP_MODEL_DIR,
                                           'ner.model')  #命名实体识别模型路径
        self.par_model_path = os.path.join(self.LTP_MODEL_DIR,
                                           'parser.model')  #依存句法分析模型路径
        self.segmentor.load(self.cws_model_path)  #加载cws模型
        self.postagger.load(self.pos_model_path)  #加载pos模型
        self.recognizer.load(self.ner_model_path)  #加载ner模型
        self.parser.load(self.par_model_path)  #加载parser模型

    #释放实例
    def end(self):
        self.segmentor.release()  #分词
        self.postagger.release()  #词性标注
        self.recognizer.release()  #命名实体识别
        self.parser.release()  #依存句法分析

    # load user-defined lexicons (load_with_lexicon also needs the model path)

    def add_cws_userdict(self, lexicon_path):
        self.segmentor.load_with_lexicon(self.cws_model_path, lexicon_path)

    def add_pos_userdict(self, lexicon_path):
        self.postagger.load_with_lexicon(self.pos_model_path, lexicon_path)

    def add_ner_userdict(self, lexicon_path):
        # note: lexicon loading may not be supported by pyltp's NamedEntityRecognizer
        self.recognizer.load_with_lexicon(self.ner_model_path, lexicon_path)

    def add_par_userdict(self, lexicon_path):
        # note: lexicon loading may not be supported by pyltp's Parser
        self.parser.load_with_lexicon(self.par_model_path, lexicon_path)

    # sentence splitting: split on punctuation and return a list of sentences
    def sentsplit(self, text):
        sentences = SentenceSplitter.split(text)
        sentences_list = list(sentences)
        return sentences_list

    #分词。返回词列表。
    def cws(self, text):
        words = self.segmentor.segment(text)
        words_list = list(words)
        return words_list

    #词性标注。返回词性标注列表。
    def pos(self, words):
        postags = self.postagger.postag(words)
        postags_list = list(postags)
        return postags_list

    #命名实体识别。返回命名实体类型列表。
    def ner(self, words, postags):
        nertags = self.recognizer.recognize(words, postags)
        nertags_list = list(nertags)
        return nertags_list

    #依存句法分析。
    def par(self, words, postags):
        arcs = self.parser.parse(words, postags)
        pr_list = []
        word_list = []
        word_pos_list = []
        source_list = []
        source_pos_list = []
        relation_list = []
        for i, k in enumerate(arcs):
            word = words[i]
            word_pos = postags[i]
            source = words[k.head - 1]
            source_pos = postags[k.head - 1]
            relation = k.relation
            word_list.append(word)
            word_pos_list.append(word_pos)
            source_list.append(source)
            source_pos_list.append(source_pos)
            relation_list.append(relation)
            pr_list.append([word, word_pos, source, source_pos, relation])
        df_list = [
            word_list, word_pos_list, source_list, source_pos_list,
            relation_list
        ]
        return pr_list, df_list, arcs
Example #18
corpus_test = X_test_sentence['ner'].map(filtered_segment).tolist()

# extract tf-idf features
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)
print(X_train.shape)
print(X_test.shape)

# extract syntactic features
# 1. distance between the two company entities
# 2. dependency-path distance between the two company entities
# 3. distance from each entity to the key trigger word
# 4. dependency relation types of the entities
postagger = Postagger()
postagger.load_with_lexicon('../data/ltp-models/ltp_data_v3.4.0/pos.model',
                            '../data/user_dict.txt')  # 加载模型
segmentor = Segmentor()
segmentor.load_with_lexicon('../data/ltp-models/ltp_data_v3.4.0/cws.model',
                            '../data/user_dict.txt')  # 加载模型


def shortest_path(arcs_ret, source, target):
    """
    Compute the shortest dependency path between two words; return -1 if no path exists.
    arcs_ret: table (DataFrame) with the dependency parse result
    source: entity 1
    target: entity 2
    """
    G = nx.DiGraph()
    # add the nodes to the graph...
    for i in list(arcs_ret.index):
Example #19
class ExtraModel:
    def __init__(self):
        self.nr_table_name = ''
        self.nrTable = None
        # 词性标注
        pos_model_path = os.path.join(os.path.dirname(__file__), '../data/ltp_data/pos.model')
        self.postagger = Postagger()
        # self.postagger.load(pos_model_path)

        # 依存句法分析
        par_model_path = os.path.join(os.path.dirname(__file__), '../data/ltp_data/parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)

    def getNrTable(self, book):
        f = open(NR_TABLE_PATH_BASE+book+'.nps.txt')
        data = f.read().splitlines()[1:]
        NrTable = []
        for it in data:
            nr = it.split(',')[0]
            NrTable.append(nr)
        return NrTable

    def loadNrTable(self, book, nrTable=None):
        if(nrTable==None):nrTable = self.getNrTable(book)
        path = 'data/npList/' + book + '.nps.table'
        if(os.path.exists(path)):
            self.postagger.load_with_lexicon('data/ltp_data/pos.model', path)
        else:
            out = open(path, 'w')  # write the lexicon to the same path that is loaded below
            for nr in nrTable:
                out.write(nr + ' nh\n')
            out.close()
            self.postagger.load_with_lexicon('data/ltp_data/pos.model', path)
        for nr in nrTable:jieba.add_word(nr)
        self.nrTable = nrTable
        self.nr_table_name = book

    def InputWords(self, words):
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        rely_id = [arc.head for arc in arcs]  # ids of the dependency heads
        relation = [arc.relation for arc in arcs]  # dependency relations
        print(words)
        print(list(postags))
        heads = [-1 if id == 0 else id - 1 for id in rely_id]  # map to head-word indices
        for i, v in enumerate(heads):
            if v == -1:
                return relation, heads, i, postags
        return None

    def addCooNode(self, s, word, sbvlink):
        if(s.coo):
            self.addCooNode(s.coo, word, sbvlink)
        else:
            verb = Sentence(word)
            if(word in sbvlink):
                verb.sbv = sbvlink[word]
            s.coo = verb
        return None

    def addVob(self, s, head, word):
        if(s!=None):
            if(s.v == head):
                s.vob = word
            else:
                self.addVob(s.coo, head, word)
        return None

    def getMainSentence(self, s):
        s = self.DialogFliter(s)
        words = list(jieba.cut(s))
        if(not words):
            return []
        relation, heads, root, postags = self.InputWords(words)
        # print(list(postags))
        stack = [words[root]]
        res = {}
        cooLink = {}
        sbvLink = {}
        n = len(words)
        while(stack):
            hed = stack.pop(0)
            res[words[hed]] = Sentence(words[hed])
            for i in range(n):
                r = relation[i]
                if(r == 'COO'):
                    h = heads[i]
                    w = words[i]
                    if (h in cooLink):
                        h = cooLink[h]
                    if(h == hed and h!=w):
                        self.addCooNode(res[words[h]],w, sbvLink)
                        cooLink[w] = h
                if(r == 'VOB'):
                    h = heads[i]
                    if (h in cooLink):
                        h = cooLink[h]
                    if(h==hed):
                        w = words[i]
                        self.addVob(res[words[h]], words[heads[i]], w)
                        if(postags[i]=='v' and (w not in res)):
                            stack.append(w)
                if(r == 'SBV'):
                    h = heads[i]
                    if(h in cooLink):h = cooLink[h]
                    if(h == hed):
                        verb = res[words[h]]
                        while(verb.v!=heads[i]):verb = verb.coo
                        if(verb.sbv):
                            verb.sbv += words[i]
                        else:
                            verb.sbv = words[i]
                    else:
                        sbvLink[heads[i]] = words[i]
        text = []
        for it in res:
            text.append(self.getSentence(res[it]).replace('\n',''))
        # for i in range(len(words)):
        #     print(relation[i] + '(' + words[i] + ', ' + str(heads[i]) + ')',end='+')
        # print('')
        # print(s, text)
        return text

    def DialogFliter(self, s):
        res = re.sub('[\'\"‘“].*?[’”\'\"]','',s)
        # print(res)
        pos  = re.finditer('::',res)
        content = []
        for it in pos:
            content.append(it.span())
        if(len(content)<2):
            res = res[content[0][1]:]
        else:
            res = res[content[0][1]:content[1][0]]
        return res

    def readCoo(self, s):
        text = ''
        if(s):
            if s.sbv:
                text = s.sbv + text
            text = text + s.v
            if(s.vob):
                text = text + s.vob
            return [text] + self.readCoo(s.coo)
        return [text]

    def getSentence(self, sentence):
        text = []
        # print(sentence)
        if(sentence.vob):
            text.append(sentence.v+sentence.vob)
        else:
            text.append(sentence.v)
        # print(sentence.coo, sentence.v)
        if(sentence.coo):
            text =  text + self.readCoo(sentence.coo)
        text = ','.join(text)
        if (sentence.sbv):
            text = sentence.sbv+text
        else:
            text = '[unknown]' + text
        return text

    def getMain(self, text, book):
        nr = []
        words = list(jieba.cut(text))
        relation, heads, root, postags = self.InputWords(words)
        queue = [root]
        N = len(words)
        while (queue):
            current = queue.pop(0)
            for i in range(N):
                if (heads[i] == current):
                    if (relation[i] == 'SBV' or relation[i] == 'ATT'):
                        if (relation[i] == 'SBV'):
                            nr.append((words[i],re.search(words[i],text).start(),postags[i],1))
                        else:
                            if (words[i] in self.nrTable):
                                nr.append((words[i],re.search(words[i],text).start(),postags[i],1))
                    elif (relation[i] == 'COO'):
                        queue.append(i)
        return nr
Example #20
class LTP:
    def __init__(
        self,
        ltp_data_path=None,
        seg_lexicon=None,
        pos_lexicon=None,
    ):
        if not ltp_data_path:
            raise ValueError('请指定ltp用到的模型所在路径!!!')

        self.ltp_data_path = ltp_data_path  # path of the LTP model directory
        self._cws_model_path = os.path.join(
            self.ltp_data_path, 'cws.model')  # word segmentation model path, file name `cws.model`
        self._pos_model_path = os.path.join(
            self.ltp_data_path, 'pos.model')  # POS tagging model path, file name `pos.model`
        self._ner_model_path = os.path.join(
            self.ltp_data_path, 'ner.model')  # NER model path, file name `ner.model`

        self._segmentor = Segmentor()  # initialize the instance
        if seg_lexicon:
            self._segmentor.load_with_lexicon(
                self._cws_model_path, seg_lexicon)  # the second argument is the external lexicon path
        else:
            self._segmentor.load(self._cws_model_path)

        self._postagger = Postagger()  # initialize the instance
        if pos_lexicon:
            self._postagger.load_with_lexicon(
                self._pos_model_path, pos_lexicon)  # the second argument is the external lexicon path
        else:
            self._postagger.load(self._pos_model_path)

        self._recognizer = NamedEntityRecognizer()  # initialize the instance
        self._recognizer.load(self._ner_model_path)  # load the model

    def cut(self, text):
        return self._segmentor.segment(text)

    def pos(self, text):
        words = self.cut(text)
        postags = self._postagger.postag(words)

        return zip(words, postags)

    def ner(self, text):
        """
        Named entity recognition for three entity types: PER (person), LOC (place), ORG (organization).
        :param text:
        :return:
        """
        # Nh = person name, Ni = organization name, Ns = place name
        ner_dict = {'Nh': [], 'Ni': [], 'Ns': []}
        words = self.cut(text)
        postags = self._postagger.postag(words)
        nertags = self._recognizer.recognize(words, postags)

        ner_tmp = []
        for i, tag in enumerate(nertags):
            if tag == 'O':
                continue
            if tag.startswith('S'):
                tag = tag.split('-')[-1]
                ner_dict[tag].append(words[i])
            elif tag.startswith('B') or tag.startswith('I'):
                ner_tmp.append(words[i])
                continue
            elif tag.startswith('E'):
                ner_tmp.append(words[i])
                tag = tag.split('-')[-1]
                ner_dict[tag].append(''.join(ner_tmp))
                ner_tmp = []
        if ner_tmp:
            tag = list(nertags)[-1]
            tag = tag.split('-')[-1]
            ner_dict[tag].append(''.join(ner_tmp))

        ner_map = dict()
        ner_map['PER'] = ner_dict['Nh']
        ner_map['ORG'] = ner_dict['Ni']
        ner_map['LOC'] = ner_dict['Ns']

        return ner_map

    def release(self):
        self._segmentor.release()
        self._recognizer.release()
        self._postagger.release()
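A minimal usage sketch for the LTP wrapper above; the model directory is a placeholder path and the sentence is illustrative:

ltp = LTP(ltp_data_path='/path/to/ltp_data_v3.4.0')
print(list(ltp.pos('小明在上海的一家银行工作')))   # [(word, postag), ...]
print(ltp.ner('小明在上海的一家银行工作'))          # {'PER': [...], 'ORG': [...], 'LOC': [...]}
ltp.release()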
Example #21
File: cut.py  Project: spikems/freq_tool
#coding:utf-8
from pyltp import Postagger
from pyltp import Parser
import sys
import os
import jieba
import chardet

reload(sys)
sys.setdefaultencoding('utf-8')
dir_path = os.path.dirname(os.path.abspath(__file__))
LTP_DATA_DIR = '/home/wangwei/hotword/hotword/conf/ltp_data'
jieba.load_userdict("/home/wangwei/hotword/hotword/conf/jieba_lexicon")
postagger = Postagger()  # 初始化实例
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
postagger.load_with_lexicon(pos_model_path, '/home/wangwei/model/posttags.txt')
# tmplist =[]
# with open(sys.argv[2],'rb') as f:
#    for line in f:
#        if line:
#            tmplist.append(line.strip())


def cut_word(sents):
    """
    Word segmentation.
    """
    words = [i.encode('utf-8', 'ignore')
             for i in jieba.cut(sents, HMM=False)]  # HMM=False
    #    print sents, '\t'.join(words)
    return words
Example #22
class FindAttribute(object):
    def __init__(self, moniter_word, Dsent, industry_id):
        self.moniter_word = moniter_word
        self.industry_id = industry_id
        self.Dsent = Dsent
        self.postagger = Postagger()  # 初始化实例
        self.postagger.load_with_lexicon(pos_model_path,
                                         '%s/conf/posttags.txt' % dir_path)
        # self.parser = Parser()  # 初始化实例
        # self.parser.load(par_model_path)  # 加载模型
        self.sql = mysqls()
        self.carattributes, self.sysn, self.dup_word = self.sql.run(
            industry_id)
        self.n_v = []

    def cut_word(self, sents):
        # word segmentation
        words = [i.encode('utf-8', 'ignore')
                 for i in norm_cut(sents)]  # HMM=False
        num = 0
        # map synonyms to their canonical form
        for w in words:
            if w in self.sysn.keys():
                words[num] = self.sysn[w]
            num += 1
        return words

    def word_sex(self, ):
        # get part-of-speech tags
        postags = list(self.postagger.postag(self.words))  # POS tagging
        num = 0
        # the word right before an adverb or adjective, if it is a noun or verb
        for tag in postags:
            if tag in ['a', 'd']:
                if num != 0 and postags[num - 1] in ['n', 'v']:
                    if self.words[num - 1] not in self.carattributes \
                            and len(self.words[num - 1].decode('utf-8','ignore')) > 1:
                        self.n_v.append(self.words[num - 1])
            # a noun or verb at the start of the clause
            if tag in ['n', 'v'] and num == 0:
                if self.words[num] not in self.carattributes\
                        and len(self.words[num].decode('utf-8','ignore')) > 1:
                    # self.words[num] not in self.dup_word \
                    self.n_v.append(self.words[num])
            num += 1
        # print 'POS tags', '\t'.join(postags)
        return postags

    def prepare(self, ):
        for id, sentences in self.Dsent.items():
            split_sentence = re.split(
                ur'[,,()()、: …~?。!. !?]?',
                sentences.decode('utf-8', 'ignore').strip())
            for sent in split_sentence:
                self.words = self.cut_word(sent.encode('utf-8', 'ignore'))
                self.postags = self.word_sex()

        # self.segmentor.release()  # 释放模型
        # outfile = open('attribute_dup.txt', 'a')
        # for word in set(self.n_v):
        cword = Counter(self.n_v)
        lresult = heapq.nlargest(500, cword.items(), key=lambda x: x[1])
        lword = []
        for rg in lresult:
            w, n = rg
            lword.append(w)
        # self.sql.insert(self.industry_id, lword)
        self.postagger.release()  # 释放模型
        # self.parser.release()  # 释放模型
        # outfile.close()
        return lresult
Example #23
class LtpParser():
    def __init__(self):
        LTP_DIR = "E:\\study\\Projects\\data-mining\\ltp\\ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        #self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), os.path.join(LTP_DIR, "word_dict")) #加载外部词典

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"), os.path.join(LTP_DIR, "n_word_dict")) #加载外部词典

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model")) #依存句法分析

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))#实体识别

        self.labeller = SementicRoleLabeller()  # 语义角色标注
        self.labeller.load(os.path.join(LTP_DIR, "pisrl_win.model"))

        #加载停用词
        with open(LTP_DIR + '\\stopwords.txt', 'r', encoding='gbk') as fread:
            self.stopwords = set()
            for line in fread:
                self.stopwords.add(line.strip())

    '''Align each recognized entity with its POS tag'''
    def wordspostags(self, name_entity_dist, words, postags):
        pre = ' '.join([item[0] + '/' + item[1] for item in zip(words, postags)])
        post = pre
        for et, infos in name_entity_dist.items():
            if infos:
                for info in infos:
                    post = post.replace(' '.join(info['consist']), info['name'])
        post = [word for word in post.split(' ') if len(word.split('/')) == 2 and word.split('/')[0]]
        words = [tmp.split('/')[0] for tmp in post]
        postags = [tmp.split('/')[1] for tmp in post]

        return words, postags

    '''Arrange the NER results into lists of entities'''
    def entity(self, words, netags, postags):
        '''
        :param words: word list
        :param netags: NER tag list
        :param postags: POS tag list
        :return:
        '''
        name_entity_dict = {}
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        index = 0
        for item in zip(words, netags):
            word = item[0]
            ntag = item[1]
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word + '_%s ' % index)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word + '_%s ' % index)
                    else:
                        place_entity_list.append(word + '_%s ' % index)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""
            index += 1
        name_entity_dict['nhs'] = self.modify(name_entity_list, words, postags, 'nh')
        name_entity_dict['nis'] = self.modify(organization_entity_list, words, postags, 'ni')
        name_entity_dict['nss'] = self.modify(place_entity_list, words, postags, 'ns')
        return name_entity_dict

    def modify(self, entity_list, words, postags, tag):
        modify = []
        if entity_list:
            for entity in entity_list:
                entity_dict = {}
                subs = entity.split(' ')[:-1]
                start_index = subs[0].split('_')[1]
                end_index = subs[-1].split('_')[1]
                entity_dict['stat_index'] = start_index
                entity_dict['end_index'] = end_index
                if start_index == entity_dict['end_index']:
                    consist = [words[int(start_index)] + '/' + postags[int(start_index)]]
                else:
                    consist = [words[index] + '/' + postags[index] for index in
                               range(int(start_index), int(end_index) + 1)]
                entity_dict['consist'] = consist
                entity_dict['name'] = ''.join(tmp.split('_')[0] for tmp in subs) + '/' + tag
                modify.append(entity_dict)
        return modify

    '''POS tags and named entities'''
    def post_ner(self, words):
        postags = list(self.postagger.postag(words))
        # words_filter =[]
        # postags = []
        # for word, postag in zip(words, self.postagger.postag(words)):
        #     if 'n' in postag:
        #         postags.append(postag)
        #         words_filter.append(word)
        nerags = self.recognizer.recognize(words, postags)
        return postags, nerags

    def parser_process(self, sentence):
        words = list(jieba.cut(sentence))
        post, ner = self.post_ner(words)  # 词性和实体
        name_entity_dist = self.entity(words, ner, post)
        words, postags = self.wordspostags(name_entity_dist, words, post)
        return words, postags

    def analysis(self,sentence):
        words = list(jieba.cut(sentence))
        post, ner = self.post_ner(words)  # 词性和实体
        arcs = self.parser.parse(words, post)
        return words,arcs

    def getWord(self,Type, arcs, words):
        res = None
        for i in range(len(words)):
            if arcs[i].relation == Type:
                res = words[i]
                break
        return res

    def getFirst(self,List):
        for i in List:
            if i is not None:
                return i
        return None

    def getMain(self,sentence):
        words, arcs = self.analysis(sentence)
        hed = self.getWord("HED", arcs, words)

        sbv =self.getWord("SBV", arcs, words)
        vob =self.getWord("VOB", arcs, words)
        fob =self.getWord("FOB", arcs, words)

        adv =self.getWord("ADV", arcs, words)
        pob =self.getWord("POB", arcs, words)

        zhu =self.getFirst([sbv, pob])
        wei = hed
        bin = self.getFirst([vob, fob, pob])

        string = '{}{}{},(副词:{})'.format(zhu, wei, bin, adv)
        return string.replace('None', '')

    def release_model(self):
        # 释放模型
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
Example #24
class LTPParser(Parser):
    """
    A Parser implementation based on LTP.

    LTP's support for user-defined dictionaries is limited, see http://www.ltp-cloud.com/support/
    1. After extending the custom dictionary, LTP has to be recompiled
    2. Word segmentation supports a custom lexicon, but POS tagging does not
    """
    def __init__(self,
                 ltp_model_dir,
                 custom_seg_file=None,
                 custom_pos_file=None):
        """
        :param ltp_model_dir:
        """

        super(LTPParser, self).__init__()

        self._ltp_dir = ltp_model_dir
        '''load the word segmentation model'''
        seg_model_file = os.path.join(self._ltp_dir, 'cws.model')
        self._segmentor = Segmentor()
        if custom_seg_file:
            self._segmentor.load_with_lexicon(seg_model_file, custom_seg_file)
        else:
            self._segmentor.load(seg_model_file)
        '''load the POS tagging model'''
        self._tagger = Postagger()
        pos_model_file = os.path.join(self._ltp_dir, 'pos.model')
        if custom_pos_file:
            self._tagger.load_with_lexicon(pos_model_file, custom_pos_file)
        else:
            self._tagger.load(pos_model_file)
        '''load the NER model'''
        self._ner = NamedEntityRecognizer()
        self._ner.load(os.path.join(self._ltp_dir, 'ner.model'))
        '''load the dependency parsing model'''
        self._parser = LParser()
        self._parser.load(os.path.join(self._ltp_dir, 'parser.model'))

    def segment(self, txt):
        return list(self._segmentor.segment(txt))

    def pos(self, txt, cache=False):

        result = None

        if cache:
            result = self._get_from_cache(txt)

        if result is None:
            tokenized = self.segment(txt)
            tags = self._tagger.postag(tokenized)

            result = []
            for i, w, t in zip(list(range(len(tokenized))), tokenized, tags):
                result.append(Token(w, t, i))

            self._set_cache(txt, result)

        return result

    def ner(self, txt):
        tokens = self.pos(txt)
        return list(
            self._ner.recognize([t.word for t in tokens],
                                [t.pos for t in tokens]))

    def parse2relations(self, txt):
        tokens = self.pos(txt)

        words = [t.word for t in tokens]
        tags = [t.pos for t in tokens]

        arcs = self._parser.parse(words, tags)

        result = []
        for i, w, p, a in zip(list(range(len(words))), words, tags, arcs):
            head_token = Token(words[a.head - 1] if a.head > 0 else 'Root',
                               tags[a.head - 1] if a.head > 0 else 'Root',
                               a.head - 1)
            dep_token = Token(w, p, i)

            result.append(Relation(a.relation, head_token, dep_token))

        return result

    def parse2sents(self, txt):
        sents = []

        for sent_txt in self.ssplit(txt):
            sent_relations = self.parse2relations(sent_txt + '。')
            tokens = set()

            for relation in sent_relations:
                if relation.token1.word != 'Root':  # head tokens are created with the word 'Root'
                    tokens.add(relation.token1)
                tokens.add(relation.token2)

            tokens = sorted(tokens, key=lambda t: t.id)

            # sent = Sentence(''.join([w.word for w in tokens]))
            sent = Sentence(sent_txt)

            sent.tokens = tokens
            sent.relations = sent_relations

            sents.append(sent)

        return sents
Example #25
class LtpParser:
    def __init__(self):
        LTP_DIR = "../../res/ltp/ltp_data_v3.4.0"
        LTP_DIR_USER = "******"
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"), os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''semantic role labeling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    def build_parse_child_dict_two(self, words, arcs):
        """
        For each word in the sentence, maintain a dict of its dependency-children indices.
        Args:
            words: word list
            postags: POS tag list
            arcs: dependency arc list
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            # if child_dict.has_key('SBV'):
            #    print words[index],child_dict['SBV']
            child_dict_list.append(child_dict)
        return child_dict_list

    '''Dependency parsing: for each word in the sentence, maintain a dict of its dependency-children indices'''

    def build_parse_child_dict(self, words, postags, arcs):
        # print(words, postags, "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head word, or 'Root'
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list
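    # Each item of format_parse_list is
    # [relation, word, word index, POS, head word, head index, head POS];
    # when the head is the virtual Root the head index becomes -1 (note that the
    # head POS then wraps around to the last word's tag).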

    '''Main parser entry point'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        parse_child_dict = self.build_parse_child_dict_two(words, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list, parse_child_dict

    '''Main parser entry point, with named entity recognition'''

    def parser_main_two(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        # Named entity recognition: mainly identifies person, place and organization names.
        netags = self.recognizer.recognize(words, postags)
        # Format the dependency parse
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        # Semantic roles
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, netags, arcs, child_dict_list, format_parse_list, roles_dict
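A short usage sketch for the LtpParser class above, assuming the LTP model directory and user dictionary referenced in __init__ actually exist; the sentence is illustrative:
if __name__ == '__main__':
    parser = LtpParser()
    sentence = '李克强总理今天来到中国银行考察工作。'
    words, postags, netags, arcs, child_dict_list, format_parse_list, roles_dict = \
        parser.parser_main_two(sentence)
    for item in format_parse_list:
        print(item)  # e.g. ['SBV', '总理', 1, 'n', '来到', 3, 'v'] (illustrative)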
Example #26
class PyLTPEntityExtractor(EntityExtractor):
    name = 'PyLTP_entity_extractor'

    provides = ["entities"]

    requires = ['tokens']

    defaults = {
        "model_path": None,  # Nh: name Ni: organization Ns: place
        "part_of_speech": ['nh'],
        "rename_to_entity": ['username'],  # rename 'nh' to 'username'
        "dictionary_path": None  # customize dictionary
    }

    def __init__(self, component_config=None):
        # type: (Optional[Dict[Text, Text]]) -> None

        super(PyLTPEntityExtractor, self).__init__(component_config)
        self.model_path = self.component_config.get('model_path')
        self.dictionary_path = self.component_config.get('dictionary_path')

        self.segmentor = Segmentor()
        self.postagger = Postagger()
        if self.dictionary_path is None:
            self.segmentor.load(self.model_path + "/cws.model")
            self.postagger.load(self.model_path + "/pos.model")
        else:
            self.segmentor.load_with_lexicon(self.model_path + "/cws.model",
                                             self.dictionary_path)
            self.postagger.load_with_lexicon(self.model_path + "/pos.model",
                                             self.dictionary_path)

    @classmethod
    def create(cls, cfg):
        component_conf = cfg.for_component(cls.name, cls.defaults)
        return PyLTPEntityExtractor(component_conf)

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["pyltp"]

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        extracted = self.add_extractor_name(self.extract_entities(message))
        message.set("entities", extracted, add_to_output=True)

    def extract_entities(self, message):
        # type: (Message) -> List[Dict[Text, Any]]
        # Set your own model path
        sentence = message.text
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        result = zip(words, postags)

        raw_entities = message.get("entities", [])

        for word, postag in result:
            part_of_speech = self.component_config["part_of_speech"]
            rename_to_entity = self.component_config["rename_to_entity"]

            if postag in part_of_speech:
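                # note: str.index returns the first occurrence, so a repeated
                # entity word is always anchored at its first position in the text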
                start = sentence.index(word)
                end = start + len(word)

                entity_index = part_of_speech.index(postag)
                rename_entity = rename_to_entity[entity_index] or postag

                hasAlreadyExtractor = False

                for obj in raw_entities:
                    if obj and obj['value'] == word:
                        hasAlreadyExtractor = True

                if not hasAlreadyExtractor:
                    raw_entities.append({
                        'start': start,
                        'end': end,
                        'value': word,
                        'entity': rename_entity
                    })
        return raw_entities

    @classmethod
    def load(
            cls,
            model_dir=None,  # type: Optional[Text]
            model_metadata=None,  # type: Optional[Metadata]
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):

        meta = model_metadata.for_component(cls.name)

        return cls(meta)
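A hedged sketch of constructing this component directly, outside a full Rasa NLU pipeline; the model path, the extra POS tag 'ni' and the entity name 'company' are assumptions for illustration only:
extractor = PyLTPEntityExtractor({
    "model_path": "/path/to/ltp_data_v3.4.0",       # assumed LTP model directory
    "part_of_speech": ["nh", "ni"],                  # person and organization POS tags
    "rename_to_entity": ["username", "company"],     # illustrative entity names
    "dictionary_path": None,
})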
Example #27
class NLPExecutor:
    def __init__(self):
        self.seg = Segmentor()
        self.seg.load(cwsPath)
        self.pos = Postagger()
        self.pos.load(posPath)
        self.parser = Parser()
        self.parser.load(parserPath)
        self.tr = TextRank4Sentence()

    '''
    param:
        text: input text
    return:
        list of summary sentences
    '''

    def generateSummary(self, text):
        # TODO 摘要生成实现方法待改进
        self.tr.analyze(text=text)
        return self.tr.get_key_sentences(num=1)

    '''
    param:
        text: input text
    return:
        list of split sentences
    '''

    def splitSentences(self, text):
        return list(SentenceSplitter.split(text))

    '''
    param:
        sent1, sent2: two sentences
    return:
        similarity score between the two sentences
    '''

    def similarity(self, sent1, sent2):
        if sent1 == '' or sent2 == '':
            return 0
        text1 = self.wordTokenize(sent1)
        text2 = self.wordTokenize(sent2)
        texts = [text1, text2]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        similarity = Similarity('-Similarity-index',
                                corpus,
                                num_features=len(dictionary))
        return similarity[dictionary.doc2bow(text1)][1]
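    # similarity() builds a bag-of-words dictionary over just the two sentences and
    # queries gensim's Similarity index with text1; element [0] is text1 vs itself,
    # so element [1] is the similarity of text1 against text2.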

    # TODO: inserting VALIDATES THAT should be handled in the RUCM generation layer
    '''
    def addValidate(self, sentence):
        tokens = self.wordTokenize(sentence)
        tokens[1] = 'VALIDATES THAT'
        return ''.join(tokens)
    '''
    '''
    param:
        sentence: a sentence
    return:
        list of tokens; punctuation marks are treated as separate tokens
    '''

    def wordTokenize(self, sentence):
        return list(self.seg.segment(sentence))

    '''
    param:
        sentence: a sentence
        wordlist: list of tokens
    return:
        list of POS tags only, index-aligned with the token list
    '''

    def posTag(self, sentence=None, wordlist=None):
        if sentence is not None:
            wordlist = list(self.seg.segment(sentence))
        return list(self.pos.postag(wordlist))

    '''
    param:
        segDict: path to a segmentation dictionary file, one word per line
        posDict: path to a POS dictionary file; each line holds a word followed by
                 one or more POS tags, separated by spaces
    return:
        None
    '''

    def dictUpdate(self, segDict=None, posDict=None):
        if segDict is not None:
            self.seg.load_with_lexicon(cwsPath, segDict)
        if posDict is not None:
            self.pos.load_with_lexicon(posPath, posDict)

    '''
    param:
        wordlist: list of tokens for the sentence
        text: raw sentence text; tokenized internally when wordlist is not given
    return:
        dependency parse result
    '''

    def parse(self, wordlist=None, text=None):
        if text is not None:
            wordlist = self.wordTokenize(text)
        poslist = self.posTag(wordlist=wordlist)
        return list(self.parser.parse(wordlist, poslist))

    '''
    param:
        sentence: a Sentence object
        parselist: dependency parse result
    return:
        None; the normalized text is written to sentence.normalContent
    '''

    def normalize(self, sentence, parselist=None):  # TODO: keep tuning the effect during debugging
        wordlist = sentence.wordlist
        poslist = self.posTag(wordlist=wordlist)
        if parselist is None:
            parselist = self.parse(wordlist=wordlist)
        newWords = wordlist.copy()
        # TODO: replace IF, ELSE, THEN, DO, UNTIL keywords
        # if sentence.type == 'conditional':
        # TODO
        if sentence.type != 'then':
            for i in range(0, len(wordlist)):
                if wordlist[i] == '如果':
                    newWords[i] = 'IF'
                    sentence.type = 'conditional'
                elif wordlist[i] == '那么':
                    newWords[i] = 'THEN'
                elif wordlist[i] == '否则':
                    newWords[i] = 'ELSE'
                elif wordlist[i] == '直到':
                    newWords[i] = 'UNTIL'
                    if sentence.type != 'conditional':
                        sentence.type = 'circular'
                elif wordlist[i] == '同时':
                    newWords[i] = 'MEANWHILE'
        # TODO: review the effect of dropping measure words
        if sentence.type == 'then' or sentence.type == 'normal':
            for i in range(len(parselist) - 1, -1, -1):
                if parselist[i].relation == 'ATT' and (poslist[i] == 'm'
                                                       or poslist[i] == 'q'):
                    del newWords[i]
        if sentence.normalContent is None:
            sentence.normalContent = ''
        for word in newWords:
            sentence.normalContent += word

    '''
    param:
        parselist: dependency parse result
    return:
        whether the sentence is a simple sentence (exactly one SBV relation)
    '''

    def isSimple(self, parselist):
        count = 0
        for parse in parselist:
            if parse.relation == 'SBV':
                count += 1
        if count == 1:
            return True
        else:
            return False

    '''
    param:
        sentlist: a collection of sentences
        sent: a single sentence
    return:
        index and similarity of the sentence in sentlist that is most similar to sent
    '''

    def maxSimilarity(self, sentlist, sent):
        max = [-1, -1]
        for i in range(len(sentlist)):
            similarity = self.similarity(sentlist[i].originContent,
                                         sent.originContent)
            if similarity > max[1]:
                max = [i, similarity]
        return max
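A brief usage sketch for NLPExecutor; cwsPath, posPath and parserPath are module-level model paths assumed to be defined elsewhere in the original file, and the sentence is illustrative:
executor = NLPExecutor()
words = executor.wordTokenize('如果用户输入正确,那么系统显示主页面。')
tags = executor.posTag(wordlist=words)
arcs = executor.parse(wordlist=words)
print(executor.isSimple(arcs))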
Example #28
class SentenceParser(object):
    """
    A class for segmenting text
    """
    def __init__(self):
        """
        Initial
        """
        self.sen_split = SentenceSplitter()
        self.seg = Segmentor()
        self.seg.load_with_lexicon(CWS_MODEL, "resource/lexicon")
        self.pos = Postagger()
        self.pos.load_with_lexicon(POS_MODEL, "resource/lexicon")
        self.parser = Parser()
        self.parser.load(PARSER_MODEL)

        self.rule = IterDocument("resource/rule")

    def seg_sentence(self, text):
        return self.sen_split.split(text)

    def seg_token(self, text):
        """
        :param text: the raw string
        :return: a list of token
        """
        return self.seg.segment(text)

    def pos_tag(self, words):
        """
        :param words: the list of token
        :return: a list of pos
        """
        return self.pos.postag(words)

    def parse(self, words, pos):
        if len(words) == 0 or len(pos) == 0:
            return WordNode("", "", "", None)
        arcs = self.parser.parse(words, pos)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        nodes = list(map(lambda x: (x.head, x.relation), arcs))

        root_idx = find_x(nodes, 0)
        root = WordNode(words[root_idx[0]], pos[root_idx[0]],
                        nodes[root_idx[0]][1])
        tree = {root_idx[0]: root}
        queue = root_idx

        while len(queue):
            next_idx = queue.pop()
            for idx in find_x(nodes, next_idx + 1):
                queue.insert(0, idx)
                new_node = WordNode(words[idx], pos[idx], nodes[idx][1])
                tree[next_idx].next.append(new_node)
                tree[idx] = new_node

        return root
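    # find_x() is defined elsewhere in the original module; judging from its usage it
    # returns the indices of nodes whose head equals the given value, so the loop
    # above is a breadth-first walk that attaches each word to its head node.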

    def extract(self, path):
        res = []
        if len(path) == 0:
            return res
        rule = self.rule
        for p in path:
            for r in rule:
                window_size = len(r.split(";"))
                if len(p) == window_size:
                    if ";".join(map(lambda x: "%s,%s" % (x.relation, x.pos),
                                    p)) == r:
                        res.append("".join(map(lambda x: x.token, p)))
                else:
                    for i in range(len(p) - window_size + 1):  # slide the rule window over the whole path
                        p_slice = ";".join(
                            map(lambda x: "%s,%s" % (x.relation, x.pos),
                                p[i:i + window_size]))
                        if p_slice == r:
                            res.append("".join(
                                map(lambda x: x.token, p[i:i + window_size])))
                            break
        return res
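The extract() method matches windows of (relation, POS) pairs along a dependency path against the patterns loaded from resource/rule; the rule file itself is not shown, but from the comparison above each line is assumed to look like the following illustrative pattern:
# SBV,n;HED,v;VOB,n
# i.e. ';'-separated "relation,pos" pairs; a path slice whose nodes carry exactly
# these pairs is joined into one extracted string.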
Example #29
File: ner03.py  Project: jiangq195/tanxin
])

X = pd.read_csv('./x.csv')
corpus = X['ner'].map(f).tolist()

# print(corpus)

tfidf = TfidfVectorizer()
tfidf.fit(corpus)
tfidf_train = tfidf.transform(corpus)

tfidf_feature = pd.DataFrame(tfidf_train.toarray())

postagger = Postagger()  # initialize the instance
# postagger.load_with_lexicon(r'F:\ltp_data\pos.model', '../data/user_dict.txt')  # load the model
postagger.load_with_lexicon(r'E:\ltp_data\pos.model',
                            '../data/user_dict.txt')  # load the model with an external user dictionary


def parse(s):
    """
    对语句进行句法分析,并返回句法结果
    parse_result:依存句法解析结果
    source:企业实体的词序号
    target:另一个企业实体的词序号
    keyword_pos:关键词词序号列表
    source_dep:企业实体依存句法类型
    target_dep:另一个企业实体依存句法类型
    """
    tmp_ner_dict = {}
    num_lst = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
Example #30
class LtpParser:
    def __init__(self):
        LTP_DIR = "D:\python\ltp_data_v3.4.0"
        Segmentor_lexicon = 'D:\python\ltp_data_v3.4.0\lexicon'
        Segmentor_label_lexicon = 'D:\python\ltp_data_v3.4.0\lexicon_label'
        Postagger_lexicon = 'D:\python\ltp_data_v3.4.0\lexicon_1'
        Postagger_label_lexicon = 'D:\python\ltp_data_v3.4.0\lexicon_label_1'
        self.segmentor = Segmentor()
        self.segmentor_label = Segmentor()
        cws_model_path = os.path.join(LTP_DIR, "cws.model")
        self.segmentor.load_with_lexicon(
            cws_model_path, Segmentor_lexicon)  # load the model; the second argument is the external dictionary path
        self.segmentor_label.load_with_lexicon(
            cws_model_path, Segmentor_label_lexicon)  # load the model; the second argument is the external dictionary path

        self.postagger = Postagger()
        self.postagger_label = Postagger()
        pos_model_path = os.path.join(LTP_DIR, "pos.model")
        self.postagger.load_with_lexicon(pos_model_path, Postagger_lexicon)
        self.postagger_label.load_with_lexicon(pos_model_path,
                                               Postagger_label_lexicon)

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            #print('role.index:',role.index)
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''Dependency parsing: maintain, for each word, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # head word, or 'Root'
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parser entry point'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def tag_entity_annotation(self, entity):
        words = self.segmentor.segment(entity)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        #print('\t'.join(netags))
        return words, postags, netags

    def tag_entity_annotation_v2(self, entity):
        words = self.segmentor_label.segment(entity)
        postags = self.postagger_label.postag(words)
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        #print('\t'.join(netags))
        return words, postags, netags
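A short usage sketch for this second LtpParser, assuming the D:\python\ltp_data_v3.4.0 models and lexicon files referenced in __init__ are available; the entity string is illustrative:
if __name__ == '__main__':
    ltp = LtpParser()
    words, postags, netags = ltp.tag_entity_annotation('北京大学位于北京市海淀区')
    print(list(words))    # segmented words
    print(list(postags))  # POS tags, e.g. 'ni', 'v', 'ns' (illustrative)
    print(list(netags))   # NE tags such as 'S-Ni' or 'B-Ns' (illustrative)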