Example #1
import os
from pyltp import Segmentor, Postagger, NamedEntityRecognizer, Parser

def segmentation(filename, output_filename):

    print "segmenting '%s' to '%s'" % (filename, output_filename)

    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"

    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    
    # named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    
    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0]+"_ner.txt", "w")

    for _line in lines:
        line = _line.rstrip("\r\n")  # the old slice-based stripping broke on empty lines
        
        words = segmentor.segment(line)
        postags = postagger.postag(words)
#        netags = recognizer.recognize(words, postags)
#        arcs = parser.parse(words, postags)

        for i in range(len(words)):
            f.write( "%s/%s\t" % (words[i], postags[i]))
#            if netags[i]!='O':
#                fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
#        fner.write("\n")

    f.close()
    fner.close()
Example #2
    def get_postags(self, words):
        postagger = Postagger()  # initialize the tagger
        postagger.load(self.pos_model_path)  # load the model
        postags = postagger.postag(words)  # POS tagging
        print('\t'.join(postags))
        postagger.release()  # release the model
        return list(postags)
Example #3
def ltp_pos_data():
    """POS tagging with LTP."""
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS model path; the file is named `pos.model`

    from pyltp import Postagger
    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    result = []
    file = [(const.qc_train_seg, const.qc_train_pos),
            (const.qc_test_seg, const.qc_test_pos)]
    for i in range(2):
        with open(file[i][0], 'r', encoding='utf-8') as f:
            for line in f.readlines():
                attr = line.strip().split('\t')
                words = attr[1].split(" ")
                words_pos = postagger.postag(words)
                res = ' '.join([
                    "{}/_{}".format(words[j], words_pos[j])
                    for j in range(len(words))
                ])
                result.append("{}\t{}\n".format(attr[0], res))
        with open(file[i][1], 'w', encoding='utf-8') as f:
            f.writelines(result)
        result.clear()
    postagger.release()  # release the model
Example #4
def cut_words():
    # segment, POS-tag, and drop empty lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize the segmenter
    # segmentor.load('cws.model')  # load the model without a lexicon
    segmentor.load_with_lexicon('module/cws.model',
                                'userdict.txt')  # load the model plus a user lexicon
    postagger = Postagger()  # initialize the tagger
    postagger.load('module/pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # segmentation
            pos_tags = postagger.postag(words)  # POS tagging
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':  # 'wp' is punctuation in the LTP tag set
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
    cont.close()
    f.close()
    segmentor.release()
    postagger.release()
Example #5
class LtpParser:
    def __init__(self):
        LTP_DIR = "ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))


    '''Dependency parsing: maintain, for every word, a dict of its syntactic child nodes.'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:   # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head ids
        relation = [arc.relation for arc in arcs]  # dependency relations
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head words
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''main parsing entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))   # segmentation
        postags = list(self.postagger.postag(words))     # POS tagging
        arcs = self.parser.parse(words, postags)         # dependency parsing
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)

        return words, postags, child_dict_list, format_parse_list
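
A minimal usage sketch for the class above (assuming the ltp_data_v3.4.0 models are present; the sentence is illustrative). Each format_parse_list entry is [relation, word, word index, word POS, head word, head index, head POS], as the inline comment shows:

ltp = LtpParser()
words, postags, child_dict_list, format_parse_list = ltp.parser_main('李克强总理今天考察东北经济。')
for entry in format_parse_list:
    print(entry)  # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']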
Example #6
def test_ltp(document):

    LTP_DATA_DIR = r"D:\anaconda\envs\TF+3.5\Lib\site-packages\pyltp-model"
    # path to the LTP model directory
    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # dependency parser model, named `parser.model`
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  'cws.model')  # segmentation model, named `cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS tagging model, named `pos.model`

    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(document)  # segmentation
    print("\nA")
    print("分词结果:")
    print('\t'.join(words))
    segmentor.release()  # release the model

    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    print("\n")
    print("词性标注结果:")
    print('\t'.join(postags))
    postagger.release()  # release the model

    parser = Parser()  # initialize the parser
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\n")
    print("句法分析结果:")
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
Example #7
    def get_postag_list(self, word_list, model):
        # produce the POS tag list
        postag = Postagger()
        postag.load(model)
        postag_list = list(postag.postag(word_list))
        postag.release()
        return postag_list
Example #8
def namedEntityRecognize(sentence):
    '''
        Run named entity recognition with pyltp.
        Returns: 1) a list of (entity, tag) tuples and 2) the list of entity tags.
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # package as (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))

    neTagList = list(netags)  # netags is already a flat tag sequence

    return namedEntityTagTupleList, neTagList
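
A hedged usage sketch (the inout/index helpers resolve model paths in the original project; the sentence is illustrative). LTP's NER tags combine a position prefix (B/I/E/S, or O for non-entities) with a type suffix: Nh for persons, Ni for organizations, Ns for places:

tuples, tags = namedEntityRecognize('中国进出口银行与中国银行加强合作。')
orgs = [word for word, tag in tuples if tag.endswith('Ni')]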
Example #9
def postags_opt(words):
    # Set pyltp postagger model path
    LTP_DATA_DIR = '../ltp_data_v3.4.0'
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')

    # Init postagger
    postagger = Postagger()

    # Load model
    postagger.load(pos_model_path)

    # Get postags
    postags = postagger.postag(words)

    # Close postagger
    postagger.release()

    postags = list(postags)

    # Init result list
    saying_words = []

    # Filter with tag 'verb'
    for index, tag in enumerate(postags):
        if tag == 'v':
            saying_words.append(words[index])

    return saying_words
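
An illustrative call (assuming the ../ltp_data_v3.4.0 models exist; LTP should tag 说 as a verb):

print(postags_opt(['他', '说', '今天', '天气', '不错']))  # expected: ['说']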
Example #10
def locationNER(text):
    results = []  # was missing in the snippet and would raise a NameError
    # segment first
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(text)  # segmentation
    #print ('\t'.join(words))
    segmentor.release()

    # then POS tag
    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()  # release the model

    # finally recognize geographic entities
    recognizer = NamedEntityRecognizer()  # initialize the recognizer
    recognizer.load(ner_model_path)  # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    # tags = position prefix (B/I/E/S, O for none) + type (Ns place, Ni organization)
    for i in range(len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            results.append(words[i-1] + words[i] + words[i+1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    recognizer.release()
    return results
Example #11
def word_pos():
    # POS tagging with LTP
    candidate = pd.read_csv(r'../data/candidate_sentiment.csv', header=None)
    can_word = candidate[0].tolist()
    # add columns to hold the POS tags
    candidate.insert(2, 'ltp_pos', 0)
    candidate.insert(3, 'jieba_pos', 0)
    candidate.columns = ['word', 'freq', 'ltp_pos', 'jieba_pos']

    LTP_DATA_DIR = '../ltp_data_v3.4.0/ltp_data_v3.4.0'  # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS model path, named `pos.model`

    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(can_word)  # POS tagging
    postagger.release()  # release the model
    postags = list(postags)
    candidate['ltp_pos'] = postags

    # POS tagging with jieba
    jieba_pos = []
    for index, row in candidate.iterrows():
        s = row['word']
        words = pseg.cut(s)
        pos = []
        for w in words:
            pos.append(w.flag)
        pos = ' '.join(pos)
        jieba_pos.append(pos)

    candidate['jieba_pos'] = jieba_pos
    # write back with the header row
    candidate.to_csv(r'../data/candidate_sentiment.csv', index=None)
Example #12
def get_postag_list(words_list):

    postag = Postagger()
    postag.load(pos_model_path)
    postag_list = list(postag.postag(words_list))
    postag.release()
    return postag_list
Example #13
def segmentsentence(sentence):
    entity_list = []  # assumed local here; a module-level global in the original context
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    recognizer = NamedEntityRecognizer()

    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    # parser.load("./ltpdata/ltp_data_v3.4.0/parser.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    #############
    word_list = segmentor.segment(sentence)
    postags_list = postagger.postag(word_list)
    nertags = recognizer.recognize(word_list, postags_list)
    ############
    for word, ntag in zip(word_list, nertags):
        if ntag.endswith('Nh'):  # person tags look like 'S-Nh' / 'B-Nh', never bare 'Nh'
            entity_list.append(word)
    print(" ".join(word_list))
    print(' '.join(nertags))
    ############
    segmentor.release()
    postagger.release()
    # parser.release()
    recognizer.release()
    return word_list
Example #14
    def __init__(self):
        LTP_DATA_DIR = SETTINGS.LTP_DATA_DIR  # path to the LTP model directory
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, named `cws.model`
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, named `pos.model`
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parser model, named `parser.model`
        srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')  # semantic role labelling model directory `srl`; note this path is a directory, not a file
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named entity recognition model, named `ner.model`

        self.segmentor = Segmentor()  # initialize the segmenter
        self.segmentor.load_with_lexicon(cws_model_path, './segName')  # load the model plus a lexicon
        self.postagger = Postagger()  # initialize the tagger
        self.postagger.load_with_lexicon(pos_model_path, './postagName')  # load the model plus a lexicon
        self.parser = Parser()  # initialize the parser
        self.parser.load(par_model_path)  # load the model
        self.labeller = SementicRoleLabeller()  # initialize the labeller
        self.labeller.load(srl_model_path)  # load the model
        self.recognizer = NamedEntityRecognizer()  # initialize the recognizer
        self.recognizer.load(ner_model_path)  # load the model
Example #15
    def __init__(self, **kwargs):
        """
        Args:
            annotators: set that can include pos and ner.
            model: ltp model to use (path).
        """
        self.segmentor = Segmentor()  # initialize the segmenter
        self.recognizer = NamedEntityRecognizer()  # initialize the recognizer
        self.postagger = Postagger()  # initialize the POS tagger

        self.segmentor.load(cws_model_path)  # load the segmentation model

        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
        if {'pos', 'ner'} & self.annotators:  # NER also needs POS tags; avoids loading the model twice
            self.postagger.load(pos_model_path)
        if {'ner'} & self.annotators:
            self.recognizer.load(ner_model_path)
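
Illustrative construction (the enclosing class name is not shown in this fragment; LtpTokenizer is a stand-in):

tokenizer = LtpTokenizer(annotators={'pos', 'ner'})  # loads the cws, pos, and ner models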
Example #16
    def restart(self):

        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.srler.release()

        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        self.srler = SementicRoleLabeller()
        self.srler.load(os.path.join(self.MODELDIR, "pisrl.model"))
Example #17
    def __init__(self):
        print(111)
        LTP_DIR = "D:\\ltp_data\\ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))
        print(111)
Example #18
    def __init__(self, input_file, output_file, pyltp_path=None):
        # ltp model files
        LTP_DATA_DIR = 'pyltp-resource/ltp-model' if pyltp_path is None else pyltp_path
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
        cut_text_path = os.path.join(LTP_DATA_DIR, 'word_segmentation.txt')

        self.input_file = input_file
        self.output_file = output_file
        self.raw_data = []
        self.sentence_cutted = []  # segmented result: one word list per app description
        self.postagger = Postagger()  # initialize the POS tagger
        self.postagger.load(pos_model_path)  # load the POS model
        self.parser = Parser()  # initialize the dependency parser
        self.parser.load(par_model_path)  # load the parser model
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path, cut_text_path)
Example #19
    def ltp_postagger(self, data, is_zip=True):
        """
        POS tag reference: https://blog.csdn.net/leiting_imecas/article/details/68485254
        :param data: raw text, or an already-segmented word list
        :return: (word, tag) pairs if is_zip, else (words, postags)
        """
        if self._postagger is None:
            self._postagger = Postagger()
            self._postagger.load(os.path.join(self._MODELDIR, "pos.model"))
        if isinstance(data, str):
            words = self.ltp_segmentor(data)
        else:
            words = data
        postags = self._postagger.postag(words)
        if is_zip:
            return list(zip(words, postags))
        else:
            return words, postags
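
Both return shapes, illustratively (nlp stands in for an instance of the enclosing helper class; the tags shown are indicative):

pairs = nlp.ltp_postagger('中国银行加强合作')  # [('中国银行', 'ni'), ('加强', 'v'), ('合作', 'v')]
words, tags = nlp.ltp_postagger(['中国银行', '加强', '合作'], is_zip=False)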
Example #20
    def getPostagger(self):
        if Config.c_postagger:
            return Config.c_postagger
        else:
            pos_model_path = os.path.join(Config.ltp_data_dir,
                                          Config.pos_model)
            Config.c_postagger = Postagger()
            Config.c_postagger.load(pos_model_path)
            return Config.c_postagger
Example #21
    def __init__(self):
        LTP_PATH = '/root/tmp/pycharm_project_96/pyltp_test/ltp_data'

        # segmentation
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_PATH, 'cws.model'))
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_PATH, 'pos.model'))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_PATH, 'parser.model'))
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_PATH, 'ner.model'))
        # semantic role labelling
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_PATH, 'pisrl.model'))  # was .label(), which would fail before loading
Example #22
    def __init__(self):
        # LTP_DIR = './ltp_data_v3.4.0'
        print("加载模型路径", LTP_DIR)
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print("加载完毕")
Example #23
    def __init__(self):
        LTP_DIR = "./ltp_data"
        self.lac = LAC(mode='lac')
        self.lac.load_customization('data/custom.txt', sep=None)
        self.ddparser = DDParser(encoding_model='transformer')
        self.fine_info = FineGrainedInfo
        self.keyword = Keyword()
        self.jieba = jieba
        self.posseg = jieba.posseg
        self.segmentor = Segmentor(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger(
            model_path=os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer(
            os.path.join(LTP_DIR, "ner.model"))
Example #24
    def __init__(self):
        LTP_DIR = "./ltp_data"
        self.segmentor = Segmentor()
        # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"),
                                         './dict.txt')

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
Example #25
    def __init__(self, dic):
        cws_model_path = os.path.join(config.LTP_DATA_DIR, 'cws.model')
        self.segmentor = Segmentor()
        # a plain load() followed by load_with_lexicon() would load the model twice
        self.segmentor.load_with_lexicon(cws_model_path, config.dic_path)

        pos_model_path = os.path.join(config.LTP_DATA_DIR, 'pos.model')
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)

        par_model_path = os.path.join(config.LTP_DATA_DIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)

        jieba.load_userdict(dic.keys())

        self.dic = dic
Example #26
def ltp_labeling(info_list, path, mode='train'):
    """
    Split the words in each query and write the result to disk.
    """
    new_info_list, word_dict, dataset_list, sparse_rate = list(), dict(), list(), 0.0
    segmentor = Segmentor()
    segmentor.load('E://Github/CCF_Competition/data/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load('E://Github/CCF_Competition/data/ltp_data/pos.model')
    idx = 0
    for info in info_list:
        if idx % 100 == 0:
            print(1.0 * idx / len(info_list))
        idx += 1
        if mode == 'train':
            user, age, gender, education, querys = info
        elif mode == 'test':
            user, querys = info
        new_querys = list()
        for query in querys:
            new_query = list()
            # under Python 3, pyltp takes and returns str, so no encode/decode dance is needed
            words = segmentor.segment(query)
            postags = postagger.postag(words)
            for word, pos in zip(words, postags):
                new_query.append((word, pos))
            new_querys.append(new_query)
        if mode == 'train':
            new_info_list.append((user, age, gender, education, new_querys))
        elif mode == 'test':
            new_info_list.append((user, new_querys))
    # write to disk
    with open(path, 'w', encoding='gb18030') as fw:
        for info in new_info_list:
            if mode == 'train':
                user, age, gender, education, querys = info
                query_str = '\t'.join([' '.join([word + '<:>' + pos for word, pos in query]) for query in querys])
                fw.write(user + '\t' + age + '\t' + gender + '\t' + education +
                         '\t' + query_str + '\n')
            elif mode == 'test':
                user, querys = info
                query_str = '\t'.join([' '.join([word + '<:>' + pos for word, pos in query]) for query in querys])
                print(user, query_str)
                fw.write(user + '\t' + query_str + '\n')
Example #27
def get_all_name(r_filename, w_file):
    # global nlp
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory

    # segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load the model
    # words = segmentor.segment(line)  # segmentation

    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load the model
    #postags = postagger.postag(words)
    # postags = postagger.postag(['中国', '进出口', '银行', '与', '中国银行', '加强', '合作', '。'])
    #res=[]
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))

    f_r = open(r_filename, "r", encoding="utf-8")
    f_w = open(w_file, "w", encoding="utf-8")
    count = 0
    for line in f_r:
        count += 1
        line = line.strip(r"\n")
        line = raplace_line_feed(line)
        line = more_space_to_one(line)
        print(line)
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        name_list = get_name(netags, words)
        if name_list != []:
            print(name_list)
            sen = get_some_idea(line, name_list)
            print(sen)
            if sen:
                for key in sen:
                    # print(sen[key])
                    sens = "\t".join(list(set([data[1] for data in sen[key]])))
                    f_w.write(key + "\t" + sens + "\n")
    # nlp.close()
    f_r.close()
    f_w.close()
Example #28
def new_relation_find(words, sentence):
    """ Discover a new relation.

    :param words:
    :param sentence:
    :return:
    """
    # dict holding the two entities of the candidate triple
    tuple_dict = dict()
    index0 = -1
    index1 = -1
    found = False  # renamed from `bool`, which shadowed the builtin
    for entity_word in entity_words:
        if sentence.find(entity_word) != -1:
            if tuple_dict:
                # True means the two entities overlap
                if has_same(tuple_dict[index0], entity_word):
                    continue
                index1 = sentence.find(entity_word)
                tuple_dict[index1] = entity_word
                found = True
                break
            else:
                index0 = sentence.find(entity_word)
                tuple_dict[index0] = entity_word
    if found is False:
        return "", "", ""
    # sorting the dict would yield a list
    # tuple_dict = sorted(tuple_dict.items(), key=lambda d: d[0])
    words = "/".join(words).split("/")
    for key, value in tuple_dict.items():
        tuple_word = value
        words = init_words(tuple_word, words)
    # re-tag the rebuilt word list
    postagger = Postagger()  # initialize the tagger
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS model path, named `pos.model`
    postagger.load_with_lexicon(pos_model_path, 'data/postagger.txt')  # load the model plus a lexicon
    postags = postagger.postag(words)  # POS tagging
    print('\t'.join(postags))
    postagger.release()  # release the model
    # look for the new relation verb
    relation_word = ""
    index_word = 0
    for index, postag in enumerate('\t'.join(postags).split('\t')):
        index_word += len(words[index])
        if index_word >= len(sentence):
            break
        if postag == 'v' and index_word - min(index0, index1) <= 2 and max(index0, index1) - index_word <= 2 \
                and not has_same(tuple_dict[index0], words[index]) and not has_same(tuple_dict[index1],
                                                                                    words[index]) \
                and words[index] not in wrong_relation:
            relation_word = words[index]
            break
    if relation_word == "":
        return "", "", ""
    return tuple_dict[min(index0,
                          index1)], tuple_dict[max(index0,
                                                   index1)], relation_word
Example #29
def posttagger(words):
    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    #for word, tag in zip(words, postags):
    #    print(word + '/' + tag)
    postagger.release()  # release the model
    return postags
Example #30
def posttagger(words):
    postagger = Postagger()
    postagger.load(r'D:\Corpus\ltp_data_v3.4.0\pos.model')
    posttags = postagger.postag(words)  # POS tagging
    postags = list(posttags)
    postagger.release()  # release the model
    # print(type(postags))
    return postags
Example #31
def posttagger(words):
    postagger = Postagger()  # initialize the tagger
    postagger.load(
        r'D:\SUFE\ComputerContest\QASystem\DrQA-CN-master\data\ltp_data_v3.4.0\pos.model'
    )
    postags = postagger.postag(words)  # POS tagging
    postagger.release()
    return postags
Example #32
def words_cixing(words=["中国","进出口","银行","与","中国银行","加强","合作"], type_list=0, pos=0):
    """POS tagging. LTP uses the 863 POS tag set; see http://www.ltp-cloud.com/intro/ for the tag definitions.
    If type_list is truthy, return the tags as a list, e.g. ['ns', 'v', 'n', 'c', 'ni', 'v', 'v'].
    If pos is truthy, return word/tag strings, e.g. ['中国/ns', '进出口/v', '银行/n', '与/c', '中国银行/ni', '加强/v', '合作/v'].
    By default, return pyltp's native tag sequence.
    """
    if isinstance(words, str):
        words = split_words(words)
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)
    # a list-of-string parameter is supported since 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    if type_list:
        return list(postags)
    if pos:
        return ['{}/{}'.format(k, v) for k, v in zip(words, postags)]
    return postags
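
The three return modes from the docstring, side by side (assuming MODELDIR points at the LTP models):

tags = words_cixing(type_list=1)  # ['ns', 'v', 'n', 'c', 'ni', 'v', 'v']
pairs = words_cixing(pos=1)       # ['中国/ns', '进出口/v', ...]
raw = words_cixing()              # pyltp's native tag sequence; wrap in list() to materialize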
Example #33
    def __init__(self):
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model, named `cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model, named `pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # named entity recognition model, named `ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = segmentor.segment(data)
        # print("|".join(words))
        segmentor.release()

        postagger = Postagger()  # initialize the tagger
        postagger.load(self.pos_model_path)  # load the model
        self.postags = postagger.postag(self.words)  # POS tagging
        # print('\t'.join(postags))
        postagger.release()  # release the model

        recognizer = NamedEntityRecognizer()  # initialize the recognizer
        recognizer.load(self.ner_model_path)  # load the model
        self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
        # print('\t'.join(netags))
        recognizer.release()  # release the model
Example #34
# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
Example #35
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import hstack


import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path.append(os.path.join(ROOTDIR, "lib"))
# Set your own model path
MODELDIR = os.path.join("/home/fish/", "ltp_data")
from pyltp import Segmentor, Postagger, NamedEntityRecognizer  # @UnresolvedImport

# segmentation
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))


def ltp(sentence):
    words = segmentor.segment(sentence)
    # POS tagging
    postags = postagger.postag(words)
    # entity recognition
    netags = recognizer.recognize(words, postags)
    l = []
    li = zip(list(words), list(postags), list(netags))
    for a, b, c in li:
        # drop named entities; the snippet is truncated here, so the body below
        # is an assumed completion that keeps only tokens tagged 'O' (non-entities)
        if c == 'O':
            l.append(a)
    return l
Example #36
def main():

    f = open("psgs.txt", "r")
    lines = [line.rstrip() for line in f.readlines()]
    f.close()

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))

    f = open("../questions/q_facts_segged_clf.txt", "r")
    types = f.readlines()
    f.close()

    f = open("../questions/provided/q_facts.txt", "r")
    questions = [line.rstrip() for line in f.readlines()]
    f.close()

    f = open("psgs_segged.txt", "w")
    fans = open("zhidao_answer.txt", "w")
    i = 0
    qid = 0
    flag = 0

    while i < len(lines):
        line = lines[i]
        if i % 50000 == 0:
            print("\r#\t%d" % i, end="")
            sys.stdout.flush()
        if line.startswith("<question"):
            qid = int(line.split(" ")[1].split("=")[1].split(">")[0])
            flag = 0
            f.write(line + "\n")
        elif line.startswith("</doc") or line.startswith("</question"):
            f.write(line + "\n")
        elif line.startswith("<doc"):
            f.write(line + "\n" + lines[i+1] + "\n")
            i += 2
        else:
            L = len(line)
            s = 0
            for s in range(L):
                if line[s:].startswith("最佳答案:") \
                        or line[s:].startswith("[专业]答案")\
                        or line[s:].startswith("、"+questions[qid-1]):
                    break
            if line[s:].startswith("最佳答案"):
                s += 14
            elif line[s:].startswith("[专业]答案"):
                s += 15
            elif line[s:].startswith("、"+questions[qid-1]):
                s += len(questions[qid-1])+1
            if s < L and flag == 0:
                t = s + 1
                while t < L and line[t:].startswith("更多") == False\
                        and not (t+2<L and line[t]==" " and line[t+1] in "0123456789" and line[t+2] in "0123456789")\
                        and not line[t:].startswith("~")\
                        and not line[t:].startswith("?")\
                        and not line[t:].startswith("!")\
                        and not line[t:].startswith("。"):
                    t += 1
                if s < t and t-s < 200 and t-s > 1:
                    ans = line[s:t].rstrip(".。 ??,,")
                    if types[qid-1].rstrip() == "Q_number":
                        ans = first_con_number(ans)
                    fans.write("%d\t%s\n" % (qid, ans))
                    flag = 1
#            words = segmentor.segment(line)
#            postags = postagger.postag(words)
#            for j in range(len(words)):
#                f.write("%s/%s\t" % (words[j], postags[j]))
#            f.write("\n")
        i += 1
    f.close()
    fans.close()