Example #1
def seg_with_ltp40(in_file, out_file_path, manual_seg_file):
    # initialize the model
    ltp = LTP()
    line_list = []

    # save seg_result
    corpus = construct_corpus(in_file)
    f = open(out_file_path, "w", encoding='utf-8')
    for line in corpus:
        line_list.append(line)  # wrap each sentence in a single-element list ["Xxxx"]
        seg_result, hidden = ltp.seg(line_list)
        f.write("=".join(seg_result[0]) + "\n")
        line_list.clear()
        f.flush()

    # test qps
    corpus = construct_corpus(in_file, 1)
    start = time.time()
    for line in corpus:
        segment, hidden = ltp.seg(list(line))
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
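Note: ltp.seg also accepts a whole batch of sentences per call, so the per-line single-element-list pattern above can be avoided. A minimal sketch, assuming the same imports as the example above and that construct_corpus returns a list of sentence strings; the helper name seg_in_batches is ours:

def seg_in_batches(in_file, out_file_path, batch_size=64):
    ltp = LTP()
    corpus = construct_corpus(in_file)
    with open(out_file_path, "w", encoding="utf-8") as f:
        # segment batch_size sentences per call instead of one at a time
        for i in range(0, len(corpus), batch_size):
            seg_result, _ = ltp.seg(corpus[i:i + batch_size])
            for words in seg_result:
                f.write("=".join(words) + "\n")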
Example #2
File: api.py Project: nju-websoft/CKGG
class Ner:
    def __init__(self):
        self.ltp = LTP()
    
    def preprocess(self, sent):
        return re.sub(r'\s+', '', sent)

    def ner(self, sents):
        assert not any(re.search(r'\s', x) for x in sents), "no space is allowed"
        psents = [x for x in sents if x != '']
        if len(psents) == 0:
            return [[] for x in sents]
        segment, hidden = self.ltp.seg(psents)
        ne = self.ltp.ner(hidden)
        anes = []
        for sseg, sne in zip(segment, ne):
            nes = []
            slens = [0] + [len(x) for x in sseg]
            for i in range(1, len(slens)):
                slens[i] += slens[i - 1]
            for t, x, y in sne:
                if t == 'Ns':
                    nes.append([slens[x], slens[y + 1]])
            anes.append(nes)
        fnes = []
        cur = 0
        for s in sents:
            if s == '':
                fnes.append([])
            else:
                fnes.append(anes[cur])
                cur += 1
        return fnes
Example #3
def ltp_func(text_list):
    ltp = LTP()
    seg, hidden = ltp.seg(text_list)
    pos = ltp.pos(hidden)
    result = []
    for idx, val in enumerate(seg[0]):
        pag = [val, pos[0][idx]]
        result.append('/'.join(pag))
    return result
Example #4
 def dependency(self):
     sentence = self.sentence
     sentences = []
     sentences.append(sentence)
     ltp = LTP()
     seg, hidden = ltp.seg(sentences)
     dep = ltp.dep(hidden)
     print(seg)
     print(dep)
     pass
Example #5
def work_summary_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    for i, row in enumerate(f):
        if i != 0:
            val = row[1][5:].split(',')
            paList.append(val[2])
    wa, ha = ltp.seg(paList)
    pa = ltp.pos(ha)
    return wa, pa
Example #6
 def findFood(self, sentence):
     ltp = LTP()
     words, hidden = ltp.seg([sentence])
     posTags = ltp.pos(hidden)
     words = words[0]  # word segmentation result list
     posTags = posTags[0]  # POS tagging result list
     dep = ltp.dep(hidden)[0]  # dependency parsing result list
     relyId = [d[1] for d in dep]  # head (parent) word id list
     relation = [d[2] for d in dep]  # dependency relation list
     heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head word text
     string = ''
     for i in range(len(words)):
         if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
             string += words[i]
             string += ' '
     return string
Example #7
def findFood(sentence):
    ltp = LTP()
    words, hidden = ltp.seg([sentence])
    posTags = ltp.pos(hidden)
    words = words[0]  # word segmentation result list
    print(words)
    posTags = posTags[0]  # POS tagging result list
    print(posTags)
    dep = ltp.dep(hidden)[0]  # dependency parsing result list
    for t in dep:
        print(t)
    relyId = [d[1] for d in dep]  # head (parent) word id list
    relation = [d[2] for d in dep]  # dependency relation list
    heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head word text
    for i in range(len(words)):
        if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
            print("找到了一种食物:" + words[i])
Example #8
def WriteTest(readfile, savefile):
    with open(readfile, "r", encoding="utf-8") as rfp:
        ltp = LTP()
        logger.info("Processing file:%s ." % (readfile))
        with open(savefile, 'w', encoding='utf-8') as wfp:

            for row in tqdm(rfp, desc="file %s process" % (readfile)):
                sent1, sent2 = row.split('\t')
                seg, hid = ltp.seg([sent1, sent2])
                sdp = ltp.sdp(hid, mode='tree')
                pos = ltp.pos(hid)
                tmpitem = {
                    'sentence1': [seg[0], pos[0], sdp[0]],
                    'sentence2': [seg[1], pos[1], sdp[1]]
                }
                jsonline = json.dumps(tmpitem)
                wfp.write(jsonline + "\n")
Example #9
class NamedEntity:
    def __init__(self, user_dict):
        self.ltp = LTP()  # loads the Small model by default
        # user_dict.txt is the user dictionary file; max_window is the maximum forward-matching window
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """
        命名实体识别
        :param text: 原始文本
        :return: 从原始文本中抽取的命名实体
        """
        seg, hidden = self.ltp.seg(text)   # 分词
        ner = self.ltp.ner(hidden)
        entity = []
        for tag, start, end in ner[0]:
            entity.append(seg[0][start:end+1][0])
        return entity
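A possible usage sketch for the class above (the dictionary path and the sentence are placeholders). Note that entity_recognition keeps only the first word of each recognized span; joining the whole span would look like the variant below:

if __name__ == '__main__':
    ne = NamedEntity("user_dict.txt")  # placeholder dictionary path
    text = ["汤姆在复旦大学读书。"]
    print(ne.entity_recognition(text))  # first word of each entity span

    # variant that joins the full entity span instead of taking its first word
    seg, hidden = ne.ltp.seg(text)
    for tag, start, end in ne.ltp.ner(hidden)[0]:
        print(tag, "".join(seg[0][start:end + 1]))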
Example #10
    def is_word(sentence):
        from ltp import LTP
        r""" 
        Judge whether it is a word.

        :param str sentence: input sentence string
            sentence: input sentence string
        :return bool: is a word or not
        
        """
        if sentence[0] == sentence[1]:
            return True
        ltp = LTP()
        seg, hidden = ltp.seg([sentence])
        pos = ltp.pos(hidden)
        pos = pos[0]
        if len(pos) == 1 and pos[0] == 'n':
            return False
        return True
Example #11
def work_detail_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    pbList = []
    for i, row in enumerate(f):
        if i != 0:
            val = row[1][5:].split(',')
            paList.append(val[2])
            temp = val[3:]
            for v in temp:
                pbList.append(v)
    # print(paList)
    # print(pbList)
    sa, ha = ltp.seg(paList)
    sb, hb = ltp.seg(pbList)
    pa = ltp.pos(ha)
    pb = ltp.pos(hb)

    return sa, sb, pa, pb
Example #12
def prepare_ref(lines: List[str], ltp_tokenizer: LTP,
                bert_tokenizer: BertTokenizer):
    ltp_res = []

    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i:i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i:i + 100],
                             add_special_tokens=True,
                             truncation=True,
                             max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):

        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # We only save the positions of Chinese subwords starting with ##, which means they are part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # save chinese tokens' pos
                if len(clean_token) == 1 and _is_chinese_char(
                        ord(clean_token)):
                    ref_id.append(i)
        ref_ids.append(ref_id)

    assert len(ref_ids) == len(bert_res)

    return ref_ids
Example #13
def new_generate_ltp_results():
    # load the model
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)

    # read the original sentences
    data = read_file_in_ltp('../data/train_base.json')
    sentences = list(map(lambda x: x['content'], data))

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # word segmentation
        segmented0, hidden = ltp.seg([sent])
        # POS tagging
        cur_pos = ltp.pos(hidden)
        # named entity recognition
        cur_ner = ltp.ner(hidden)
        # semantic role labeling
        cur_srl = ltp.srl(hidden)
        # dependency parsing
        cur_dep = ltp.dep(hidden)
        # semantic dependency parsing (tree)
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')
        # semantic dependency parsing (graph)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')

        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])

    # build the mapping between each sentence and its segmentation
    sent_seg_matches = sentence_segment_match(data, segmented)
    pickle.dump([segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches], open('new_ltp_results.pk', 'wb'))

    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
Example #14
def prepare_ref(lines: List[str],
                ltp_tokenizer: LTP,
                bert_tokenizer: BertTokenizer,
                batch_size=1000):
    """
    Args:
        lines:  每行一个中文段落,
        ltp_tokenizer: ltp的tokenizer处理器
        bert_tokenizer:  bert的tokenizer处理器
    Returns:

    """
    ltp_res = []
    # process batch_size lines per call
    print("Segmenting with the LTP model...")
    for i in tqdm(range(0, len(lines), batch_size)):
        # run LTP word segmentation
        res = ltp_tokenizer.seg(lines[i:i + batch_size])[0]
        # keep only the purely Chinese words
        res = [get_chinese_word(r) for r in res]
        # append to ltp_res
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)
    # e.g. the processed text in ltp_res: [['效果', '一直', '用户', '感觉'], ...]
    # run the BERT tokenizer as well, batch_size lines per call
    print("Tokenizing with the BERT tokenizer...")
    bert_res = []
    for i in tqdm(range(0, len(lines), batch_size)):
        res = bert_tokenizer(lines[i:i + batch_size],
                             add_special_tokens=True,
                             truncation=True,
                             max_length=512)
        bert_res.extend(res["input_ids"])
    # eg: bert_res [ [101, 5439, 4500, 2787, 749, 8024, 671, 4684, 1762, 4500, 4007, 2051, 8024, 2697, 6230, 2190, 2971, 4576, 2971, 3779, 3126, 3362, 2923, 1962, 4638, 102]...]
    # make sure the line counts match
    print("Building the subword alignment...")
    assert len(bert_res) == len(lines)
    print_num = 5
    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):
        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        # eg : ['[CLS]', '古', '##龙', '洗', '发', '##水', ',', '洗', '完', '头', '##发', '不', '干', '##燥', '、', '也', '不', '容', '##易', '油', '、', '不', '痒', ',', '味', '##道', '持', '##久', ',', '非', '##常', '柔', '##顺', ',', '而', '##且', '泡', '##泡', '很', '容', '##易', '冲', '##洗', '干', '##净', '泡', '##沫', '非', '##常', '细', '##腻', ',', '洗', '后', '头', '##发', '很', '滑', '很', '顺', ',', '洗', '了', '之', '##后', '就', '头', '##发', '很', '蓬', '##松', ',', '很', '香', ',', '而', '##且', '我', '洗', '了', '是', '没', '##有', '头', '##皮', '##屑', '的', '[SEP]']
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # we only save the positions of Chinese subwords starting with ##, which means they are part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # keep only the non-initial parts of Chinese subwords; ref_id records their positions for this sentence
                if len(clean_token) == 1 and _is_chinese_char(
                        ord(clean_token)):
                    ref_id.append(i)
        # print the first 5 samples
        if print_num > 0:
            example_num = 5 - print_num
            print(f"Sample {example_num}: {lines[example_num]}")
            print(f"LTP segmentation of sample {example_num}: {ltp_res[example_num]}")
            print(f"BERT tokenizer output of sample {example_num}: {bert_res[example_num]}")
            print(f"BERT tokens of sample {example_num} after applying the LTP whole-word marks: {input_tokens}")
            print(f"Final ref_id (positions of non-initial Chinese subword tokens) for sample {example_num}: {ref_id}")
            print()
            print_num -= 1
        ref_ids.append(ref_id)
    # make sure a subword mapping was saved for every sentence
    assert len(ref_ids) == len(bert_res)

    return ref_ids
Example #15
from ltp import LTP
import time

start = time.time()  # record execution time

ltp = LTP()

segment, hidden = ltp.seg(["這隻程式可以幫我們把網站資料爬下來"])

pos = ltp.pos(hidden)
# ner = ltp.ner(hidden)
# srl = ltp.srl(hidden)
# dep = ltp.dep(hidden)
# sdp = ltp.sdp(hidden)

print(segment)
# print(hidden)
print(pos)

end = time.time()
print(end - start)
Example #16
File: huizong.py Project: yf1291/nlp4
kglist = ['大学', '人口', '面积']
text = '姚明的妻子的丈夫的妻子'
text = '我现在在天津,这里有什么大学?'
text = '姚明的妻子'

##
# -------- Testing shows that NER itself interferes with the question, so those items should be removed from kglist.

# tiaozhuan=searchKG(kglist=['地点','地址','大小','颜色','老婆','丈夫'],text='我家住在和平区哪个地方')

# print(tiaozhuan,"jieguo shi !!!!!!!!!!!!!!!!")
##

# Add jumping based on sentence constituents.
seg, hidden = ltp.seg([text])
# sdp = ltp.sdp(hidden, graph=False)

print(seg, "seg")
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
print("ner", ner)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)

print(ner, "ner结果")
seg = seg[0]
dep = dep[0]
sdp = sdp[0]
print(sdp, "语义分析!!!!!!!!!!!!!!!!!!!")  # really awkward to use.
Example #17
File: server.py Project: ztzdxqj/ltp
class Server(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden, fast=False)
            batch_sdp = self.ltp.sdp(hidden, mode='mix')

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result

    def serve(self, port: int = 5000, n_process: int = None):
        if n_process is None:
            n_process = 1 if sys.platform == 'win32' else 8

        fmt = LogFormatter(fmt='%(asctime)s - %(levelname)s - %(message)s',
                           datefmt='%Y-%m-%d %H:%M:%S',
                           color=True)
        root_logger = logging.getLogger()

        console_handler = logging.StreamHandler()
        file_handler = logging.FileHandler('server.log')

        console_handler.setFormatter(fmt)
        file_handler.setFormatter(fmt)

        root_logger.addHandler(console_handler)
        root_logger.addHandler(file_handler)

        app_log.setLevel(logging.INFO)
        gen_log.setLevel(logging.INFO)
        access_log.setLevel(logging.INFO)

        app_log.info("Model is loading...")
        app_log.info("Model Has Been Loaded!")

        app = Application([(r"/.*", LTPHandler, dict(ltp=self))])

        server = HTTPServer(app)
        server.bind(port)
        server.start(n_process)
        ioloop.IOLoop.instance().start()
Example #18
def save_as_txt(data):
    from ltp import LTP
    import random
    ltp = LTP()
    for row in data:
        id = row[0]
        school_id = ("000" + str(row[1]))[-4:]
        texts = row[2]
        textlines = texts.split('\n')
        shortened_textlines = []
        for line in textlines:
            line_len = len(line)
            if line_len > 100:
                for i in range(line_len // 100):
                    shortened_textlines.append(line[i * 100:(i + 1) * 100])
            else:
                shortened_textlines.append(line)
        text = ' '.join(shortened_textlines)
        path = './data/' + str(school_id)
        if not os.path.exists(path):
            os.makedirs(path)
        with open((path + '/' + str(school_id) + "-" + str(id) + ".txt"),
                  'w',
                  encoding='UTF-8') as file:
            file.write(text)
            file.close()
            print("\r已保存 " + str(school_id) + "-" + str(id) + ".txt", end="")
            # T2	报告人 68 71	曹进德
            # R2 报告人_单位 Arg1: T2 Arg2: T1
        seg, hidden = ltp.seg([text])
        ner = ltp.ner(hidden)
        ner_info = []
        entities_nh = []
        entities_ni = []
        print(type(text))
        print()
        for i in ner[0]:
            if (i[0] == 'Nh'):
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if (len(entity) > 1):
                    entities_nh.append(entity)

            elif (i[0] == 'Ni'):
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if entity in schoolnames:
                    entities_ni.append(entity)

        for entity in set(entities_nh):
            pattern = re.compile(entity)
            iter = pattern.finditer(text)
            count = 0
            for record in iter:
                ner_info.append("T" + str(300 + count) + "\t姓名 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        for entity in set(entities_ni):
            pattern = re.compile(entity)
            iter = pattern.finditer(text)
            count = 0
            for record in iter:
                ner_info.append("T" + str(400 + count) + "\t单位 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        pattern = re.compile('教授|副教授|讲师|研究员|副研究员|助理教授|助理研究员')
        iter = pattern.finditer(text)
        count = 0
        for record in iter:
            ner_info.append("T" + str(500 + count) + "\t职称 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1

        date_1 = r"([0-9]+年[0-9]+月[0-9]+日)"  # |([0-9]+月[0-9]+日)
        date_2 = r"([零〇一二三四五六七八九]年[十]?[一二三四五六七八九]月[一二三]?[十]?[一二三四五六七八九十]日)"
        date_3 = r"([0-9]+月[0-9]+日)"
        flag = False
        count = 0
        ## method 1
        pattern = re.compile(date_1)
        iter = pattern.finditer(text)
        for record in iter:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1
            flag = True

        if (flag is False):
            pattern = re.compile(date_3)
            iter = pattern.finditer(text)
            for record in iter:
                ner_info.append("T" + str(600 + count) + "\t日期 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        ## method 2
        pattern = re.compile(date_2)
        iter = pattern.finditer(text)
        for record in iter:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1

        with open((path + '/' + str(school_id) + "-" + str(id) + ".ann"),
                  'w',
                  encoding='UTF-8') as file:
            print([text])
            print(ner_info)
            file.writelines(ner_info)
            file.close()
            print("\r已保存 " + str(school_id) + "-" + str(id) + ".ann", end="")
Example #19
# coding:utf-8
# This is a runnable LTP example; just run it directly.
from ltp import LTP

ltp = LTP()

seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
pos = ltp.pos(hidden)
print(seg[0])
print(pos[0])
result = []
for idx, val in enumerate(seg[0]):
    pag = [val, pos[0][idx]]
    result.append(pag)
print(result)
Example #20
# import synonyms
import json

# sen1 = "程序员"
# sen2 = "软件工程师"
# r = synonyms.compare(sen1, sen2, seg=True)
# print(r)

# ddp = DDParser()
# # 单条句子
# re = ddp.parse("语文老师")
# print(re)

from ltp import LTP
ltp = LTP()  # loads the Small model by default
seg, hidden = ltp.seg(["语文老师"])
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)

print(seg)
# print(hidden)
print(pos)
print(pos)
print(ner)
print(srl)
print(dep)
Example #21
class CnProcessor:
    r"""
        Text processor class implementing NER.
    """
    _instance_lock = threading.Lock()

    def __init__(self):
        self.__ner = None
        self.__pos = None

    # Single instance mode
    def __new__(cls, *args, **kwargs):
        if not hasattr(CnProcessor, "_instance"):
            with CnProcessor._instance_lock:
                if not hasattr(CnProcessor, "_instance"):
                    CnProcessor._instance = object.__new__(cls)
        return CnProcessor._instance

    @staticmethod
    def word_tokenize(sent):
        r"""
        Tokenize function.

        :param str sent: the sentence to be tokenized
        :return: list, the tokens in it
        """
        assert isinstance(sent, str)

        return [word for word in sent]

    def get_ner(self, sentence):
        r"""
        NER function.

        :param str sentence: the sentence to run NER on
        :return: two forms of tags
            The first is the triple form (tag, start, end)
            The second is the list form, which marks the NER label of each character
            e.g. 周小明去玩
            ['Nh', 'Nh', 'Nh', 'O', 'O']
        """
        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Turn the list into sentence
            tmp = ''
            for word in sentence:
                tmp += word
            sentence = tmp

        if not sentence:
            return [], []

        if self.__ner is None:
            self.__ner = LTP()
        seg, hidden = self.__ner.seg([sentence])
        seg = seg[0]
        ner = self.__ner.ner(hidden)
        ner = ner[0]

        ner_label = len(sentence) * ['O']
        for i in range(len(ner)):
            tag, start, end = ner[i]
            tmp = 0
            for j in range(start):
                tmp += len(seg[j])
            start = tmp
            tmp = 0
            for j in range(end + 1):
                tmp += len(seg[j])
            end = tmp
            ner[i] = (tag, start, end - 1)
            for j in range(start, end):
                ner_label[j] = tag

        return ner, ner_label

    def get_pos_tag(self, sentence):
        r"""
        pos tag function.

        :param str sentence: the sentence to be POS tagged
        :return: the triple form (tag, start, end)
        """

        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Turn the list into sentence
            tmp = ''
            for word in sentence:
                tmp += word
            sentence = tmp

        if not sentence:
            return []

        if self.__pos is None:
            # get pos tag
            self.__pos = LTP()
        seg, hidden = self.__pos.seg([sentence])
        pos = self.__pos.pos(hidden)
        seg = seg[0]
        pos = pos[0]
        pos_tag = []
        cnt = 0
        for tag in range(len(pos)):
            pos_tag.append([pos[tag], cnt, cnt + len(seg[tag]) - 1])
            cnt += len(seg[tag])

        return pos_tag
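A brief usage sketch for the processor above (the sentence is arbitrary); it shows the two NER return forms described in the get_ner docstring:

if __name__ == '__main__':
    processor = CnProcessor()
    spans, labels = processor.get_ner("周小明去上海玩")
    print(spans)   # character-level triples, e.g. [('Nh', 0, 2), ('Ns', 4, 5)]
    print(labels)  # one tag per character, e.g. ['Nh', 'Nh', 'Nh', 'O', 'Ns', 'Ns', 'O']
    print(processor.get_pos_tag("周小明去上海玩"))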
Example #22
proxies = {'http': 'http://localhost:8888', 'https': 'http://localhost:8888'}

# initialize the segmentation tool
ltp = LTP(proxies=proxies)

sentences = pdfreader.getTestFromPdf()['Text']
seg = []
sdp = []
dep = []
pos = []
cluster = []

# segment each sentence and run dependency analyses
for st in sentences:
    if (st != ''):
        seg_temp, hidden = ltp.seg([st])
        # get the semantic dependency parse
        sdp.append(ltp.sdp(hidden)[0])
        # get the POS tag list
        pos.append(ltp.pos(hidden)[0])
        # get the word segmentation list
        seg.append(seg_temp[0])
        # get the syntactic dependency parse
        dep.append(ltp.dep(hidden)[0])

# initialize the list that stores the extracted triples
resultTriad = []

for index in range(len(dep)):
    r = getTriad(dep[index], seg[index], pos[index])
    resultTriad.append(r)
Example #23
from ltp import LTP
from config import LTP4_MODEL_DIR
from ESIServer.component.open_relation_extraction.nlp import NLP
nlp = NLP()


class NLPLTP:
    def __init__(self, default_model_dir=LTP4_MODEL_DIR):
        print(default_model_dir)
        self.ltp = LTP(path=default_model_dir)


if __name__ == '__main__':
    ltp = LTP(path=LTP4_MODEL_DIR)
    seg, hidden = ltp.seg(["他叫汤姆去拿外衣。", "他就读于复旦大学。", "吴秀波diss李明"])
    pos = ltp.pos(hidden)
    ner = ltp.ner(hidden)
    dep = ltp.dep(hidden)
    srl = ltp.srl(hidden)
    sdp = ltp.sdp(hidden)

    print(seg)
    print(pos)
    print(ner)
    print(dep)
    print(srl)
    print(sdp)

    origin_sentences = ["他叫汤姆去拿外衣。", "他就读于复旦大学。"]
    lemmas, hidden = nlp.segment(origin_sentences)
    words_postag = nlp.postag(lemmas, hidden)
Example #24
class EntityDescribeExtractByRoleAnalysisV1(object):
    """
        基于ltp 角色分析进行实体和实体描述信息抽取
    """
    def __init__(self, ltp_model_path="tiny"):
        self.ltp = LTP(ltp_model_path)

    def single_sentence(self, input_sentence, ind=0):
        seg, hidden = self.ltp.seg([input_sentence])
        words = seg[ind]

        pos = self.ltp.pos(hidden)[ind]
        roles = self.ltp.srl(hidden, keep_empty=False)[ind]

        filter_p = {"是", "为"}
        role_list = ["A0", "A1", "A2", "A3", "A5"]
        # print(words)
        spo_list = []
        for role in roles:
            r_indx, r_list = role

            p_value = words[r_indx]
            r_list = list(filter(lambda x: x[0] in role_list, r_list))
            if len(r_list) != 2:
                continue
            sub = r_list[0]
            obj = r_list[1]

            if sub[0] not in role_list:
                continue
            if obj[0] not in role_list:
                continue
            if sub[2] >= r_indx:
                continue
            if obj[1] <= r_indx:
                continue
            # filter by predicate
            if p_value not in filter_p:
                continue
            if p_value == "为":
                sub, obj = obj, sub

            # filter by part of speech
            if pos[sub[2]] not in ["n", "nz"]:
                continue

            sub_value = words[sub[1]:sub[2] + 1]

            obj_value = words[obj[1]:obj[2] + 1]

            # print("".join(sub_value), p_value, "".join(obj_value))
            spo_list.append(("".join(sub_value), p_value, "".join(obj_value)))

        return spo_list

    def extract_info(self, input_sentence_list):
        """ 抽取实体描述信息
        Args:
            input_sentence_list:

        Returns:
            entity_describe_res: List[{"sentence": xxx, "entity": xxx, "describe":xxx}]

        """
        entity_describe_res = []
        for i, sentence in enumerate(input_sentence_list):
            sentence = sentence.strip()
            if len(sentence) < 10:
                continue
            if len(sentence) > 100:
                continue
            if not re.fullmatch("^[\u4e00-\u9fa5_a-zA-Z]{1,15}是.+$", sentence):
                continue
            out_spo_list = self.single_sentence(sentence)

            for spo in out_spo_list:
                entity_describe_res.append({
                    "sentence": sentence,
                    "entity": spo[0],
                    "describe": spo[2]
                })
        return entity_describe_res

    # def single_sentence_v2(self, input_sentence):
    #     sentence_feature = [(cut.DEPREL, cut.LEMMA) for cut in HanLP.parseDependency(input_sentence)]
    #     if sentence_feature[0][0] != "主谓关系":
    #         return True
    #     if ("核心关系", "是") not in sentence_feature:
    #         return True
    #     return False

    def multi_extract_info(self, input_sentence_list):
        pool = multiprocessing.Pool(processes=3)
        spo_res = []
        for i, sentence in enumerate(input_sentence_list):
            sentence = sentence.strip()
            if len(sentence) == 0:
                continue

            out_spo_list = pool.apply_async(self.single_sentence, (sentence, ))
            # out_spo_list = self.single_sentence(sentence)
            spo_res.append(out_spo_list)
            # spo_res.append((sentence, out_spo_list))
        pool.close()
        pool.join()

        spo_res = [spo.get() for i, spo in enumerate(spo_res)]
        return spo_res
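A minimal usage sketch for the extractor above (the model path and the sentence are placeholders); extract_info returns one dict per extracted (entity, describe) pair:

if __name__ == '__main__':
    extractor = EntityDescribeExtractByRoleAnalysisV1(ltp_model_path="tiny")
    sentences = ["复旦大学是一所位于上海的综合性研究型大学。"]
    for item in extractor.extract_info(sentences):
        print(item["sentence"], item["entity"], item["describe"])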
Example #25
def text_work_summary_parser_ltp(textList):
    ltp = LTP()
    wa, ha = ltp.seg(textList)
    pa = ltp.pos(ha)
    return wa, pa
Example #26
# sentence = '整天不是吃饭就是睡觉,活得真像一头猪。'
# sentence = '整天睡觉,活得真像一头猪。'
# sentence = 'x要人名币。'
# sentence = '一切动物和植物都是生物。'
# sentence = '美丽的小猪和优雅的小熊,是一对好朋友。'
# sentence = '任意的整数和任意的浮点数的乘积是浮点数。'
# sentence = '并非x0都是偶数。'
# sentence = '并非每个自然数都是偶数。'
# sentence = '1+1*3*(2+4)'
# sentence = '(1+1)*3/(2+4)'
# sentence = '(4-3)/(5-3)'
# sentence = '4-3*5+2'
# sentence = '4-3*5'
sentence = '4*3'

seg, hidden = ltp.seg([sentence])
pos = ltp.pos(hidden)
sdp = ltp.sdp(hidden)
srl = ltp.srl(hidden, keep_empty=False)
dep = ltp.dep(hidden)
print(seg)
print(pos)
print(sdp)
print(srl)
print(dep)

# a = [(1, [(2, (3, 4)), (2, (3, 4))]), (10, [(20, (30, 40)), (20, (30, 40))])]
# b = []

# def re(a):
#     x = type(a)
Example #27
             "运营管理问题":6,
             "程序规章手册缺陷":7,
             "安检空管维修资质等其它":8}

myclasses = ["航空器系统/部件失效",
             "航空器设计制造缺陷",
             "机务人员致灾",
             "机组人员致灾",
             "零件生产质量问题",
             "运营管理问题",
             "程序规章手册缺陷",
             "安检空管维修资质等其它"]
from ltp import LTP
ltp = LTP()

X_dataset, _ = ltp.seg(X.tolist())
X_dataset = np.array(X_dataset, dtype=object)
y_dataset = [[],[],[],[],[],[],[],[]]

for i in range(len(myclasses)):
    for item in y.tolist():
        if myclasses[i] in item:
           y_dataset[i].append(1)
        else:
            y_dataset[i].append(0)

y_datatset = np.mat(y_dataset)

np.save("X_ltp.npy", X_dataset)
np.save("y_ltp.npy",y_dataset)
Example #28
class Server(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device)
        else:
            self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result
Example #29
from ltp import LTP
ltp = LTP()
segment, hidden = ltp.seg(["南京市长江大桥。"])
print(segment)

sentences = ltp.sent_split(["南京市长江大桥。", "汤姆生病了。他去医院了。"])
print(sentences)

segment, hidden = ltp.seg(sentences)
print(segment)
print(hidden)

pos_tags = ltp.pos(hidden)
print(pos_tags)

Example #30
class Run(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result

    def test(self, sentences: List[str] = None):
        self.ltp.add_words("DMI与主机通讯中断")
        if sentences is None:
            sentences = ["他叫汤姆去拿外衣。"]
        res = self._predict([sentence.strip() for sentence in sentences])
        print(json.dumps(res, indent=2, sort_keys=True, ensure_ascii=False))

    def save(self, out='ltp.npz'):
        import numpy as np
        nps = {}
        for k, v in self.ltp.model.state_dict().items():
            k = k.replace("gamma", "weight").replace("beta", "bias")
            nps[k] = np.ascontiguousarray(v.cpu().numpy())

        np.savez(out, **nps)

        config = self.ltp.config
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

    def test_seged(self):
        import torch
        sentences = [
            'My name is tom.', 'He called Tom to get coats.', '他叫Tom去拿外衣。',
            '他叫汤姆去拿外衣。', "我去长江大桥玩。"
        ]
        seg, hidden = self.ltp.seg(sentences)
        seged, hidden_seged = self.ltp.seg(seg, is_preseged=True)
        hidden: dict
        hidden_seged: dict
        for key, value in hidden.items():
            if isinstance(value, torch.Tensor):
                test = torch.sum(value.float() -
                                 hidden_seged[key].float()).numpy()
                print(key, test)

        print(seg == seged)