Example #1
import pickle

from tqdm import tqdm
from ltp import LTP

# read_file_in_ltp and sentence_segment_match are project-local helpers (not shown).


def new_generate_ltp_results():
    # Load the model
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)

    # Read the original sentences
    data = read_file_in_ltp('../data/train_base.json')
    sentences = list(map(lambda x: x['content'], data))

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # Word segmentation
        segmented0, hidden = ltp.seg([sent])
        # Part-of-speech tagging
        cur_pos = ltp.pos(hidden)
        # Named entity recognition
        cur_ner = ltp.ner(hidden)
        # Semantic role labeling
        cur_srl = ltp.srl(hidden)
        # Dependency parsing
        cur_dep = ltp.dep(hidden)
        # Semantic dependency parsing (tree)
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')
        # Semantic dependency parsing (graph)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')

        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])

    # Build the sentence-to-segmentation mapping
    sent_seg_matches = sentence_segment_match(data, segmented)
    with open('new_ltp_results.pk', 'wb') as f:
        pickle.dump([segmented, pos, ner, srl, dep, sdp_tree,
                     sdp_graph, sent_seg_matches], f)

    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
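
The pickled results can be reloaded later without rerunning LTP; a minimal sketch (the file name matches the dump above, the function name is an assumption):

def load_ltp_results(path='new_ltp_results.pk'):
    # Unpack in the same order as the pickle.dump call above.
    with open(path, 'rb') as f:
        (segmented, pos, ner, srl, dep,
         sdp_tree, sdp_graph, sent_seg_matches) = pickle.load(f)
    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches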
Example #2
import json
import logging

from tqdm import tqdm
from ltp import LTP

# A module-level logger is assumed; the original setup is not shown.
logger = logging.getLogger(__name__)


def WriteTest(readfile, savefile):
    with open(readfile, "r", encoding="utf-8") as rfp:
        ltp = LTP()
        logger.info("Processing file: %s." % readfile)
        with open(savefile, 'w', encoding='utf-8') as wfp:

            for row in tqdm(rfp, desc="processing %s" % readfile):
                # Strip the trailing newline so it does not leak into sent2.
                sent1, sent2 = row.rstrip('\n').split('\t')
                seg, hid = ltp.seg([sent1, sent2])
                sdp = ltp.sdp(hid, mode='tree')
                pos = ltp.pos(hid)
                tmpitem = {
                    'sentence1': [seg[0], pos[0], sdp[0]],
                    'sentence2': [seg[1], pos[1], sdp[1]]
                }
                jsonline = json.dumps(tmpitem, ensure_ascii=False)  # keep CJK text readable in the utf-8 output
                wfp.write(jsonline + "\n")
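
A matching reader for the JSON-lines output (a sketch; the field names follow the dict written above, the function name is an assumption):

def ReadTest(savefile):
    items = []
    with open(savefile, "r", encoding="utf-8") as fp:
        for line in fp:
            item = json.loads(line)
            # Each sentence entry is [segments, pos tags, sdp tree].
            seg1, pos1, sdp1 = item['sentence1']
            seg2, pos2, sdp2 = item['sentence2']
            items.append(((seg1, pos1, sdp1), (seg2, pos2, sdp2)))
    return items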
Example #3
import sys
import logging
from typing import List

from tornado import ioloop
from tornado.httpserver import HTTPServer
from tornado.log import LogFormatter, access_log, app_log, gen_log
from tornado.web import Application

from ltp import LTP


class Server(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        # NOTE: onnx is accepted but unused in this variant (compare Example #7).
        self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden, fast=False)
            batch_sdp = self.ltp.sdp(hidden, mode='mix')

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,  # NOTE: a word index, unlike the character offsets used for roles
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result

    def serve(self, port: int = 5000, n_process: int = None):
        if n_process is None:
            n_process = 1 if sys.platform == 'win32' else 8

        fmt = LogFormatter(fmt='%(asctime)s - %(levelname)s - %(message)s',
                           datefmt='%Y-%m-%d %H:%M:%S',
                           color=True)
        root_logger = logging.getLogger()

        console_handler = logging.StreamHandler()
        file_handler = logging.FileHandler('server.log')

        console_handler.setFormatter(fmt)
        file_handler.setFormatter(fmt)

        root_logger.addHandler(console_handler)
        root_logger.addHandler(file_handler)

        app_log.setLevel(logging.INFO)
        gen_log.setLevel(logging.INFO)
        access_log.setLevel(logging.INFO)

        app_log.info("Model is loading...")
        app_log.info("Model Has Been Loaded!")

        app = Application([(r"/.*", LTPHandler, dict(ltp=self))])

        server = HTTPServer(app)
        server.bind(port)
        server.start(n_process)
        ioloop.IOLoop.instance().start()
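
LTPHandler is referenced in the route table above but not shown; a minimal sketch of what such a Tornado handler could look like (the class name comes from the route table, the 'text' request field is an assumption):

import json

from tornado.web import RequestHandler


class LTPHandler(RequestHandler):
    def initialize(self, ltp):
        # Receives the Server instance passed via dict(ltp=self) in the route table.
        self.ltp = ltp

    def post(self):
        body = json.loads(self.request.body or '{}')
        sentences = body.get('text', [])
        if isinstance(sentences, str):
            sentences = [sentences]
        self.set_header('Content-Type', 'application/json; charset=utf-8')
        self.write(json.dumps(self.ltp._predict(sentences), ensure_ascii=False))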
Example #4
# (Excerpt: assumes `ltp = LTP()` and the query string `text` are defined earlier.)
# tiaozhuan = searchKG(kglist=['地点', '地址', '大小', '颜色', '老婆', '丈夫'],
#                      text='我家住在和平区哪个地方')
# print(tiaozhuan, "the result is")

# Add jumping based on sentence constituents.
seg, hidden = ltp.seg([text])
# sdp = ltp.sdp(hidden, graph=False)

print(seg, "seg")
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
print("ner", ner)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)

print(ner, "ner result")
seg = seg[0]
dep = dep[0]
sdp = sdp[0]
print(sdp, "semantic dependency parse")  # Hard to work with.
print(dep)
for i in dep:  # The dep parse currently cannot find the jump for '老婆'.
    print(i, seg[i[0] - 1], seg[i[1] - 1])  # Indices are 1-based; subtract 1 for the real index.
'''
Next we build a BFS algorithm over this jump graph.
'''

# dep is exactly the graph we need.
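
The closing docstring promises a BFS over this jump graph; a minimal sketch (the helper name bfs_path is an assumption) that treats the 1-based dep triples as an undirected adjacency list:

from collections import deque


def bfs_path(dep, start, goal):
    # Build an undirected adjacency list from (child, head, relation) triples;
    # indices are 1-based, node 0 is the virtual root.
    adj = {}
    for child, head, _rel in dep:
        adj.setdefault(child, []).append(head)
        adj.setdefault(head, []).append(child)
    queue, seen = deque([[start]]), {start}
    while queue:
        path = queue.popleft()
        if path[-1] == goal:
            return path
        for nxt in adj.get(path[-1], []):
            if nxt not in seen:
                seen.add(nxt)
                queue.append(path + [nxt])
    return None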
Example #5
# (Excerpt: assumes `ltp = LTP()`, `kglist`, `text`, and the helpers
# `vec2` / `cosine_distance` are defined earlier.)
import numpy as np

tmp3 = []
for i in kglist:
    t = cosine_distance(vec2(i), vec2(text))
    tmp3.append(t)
tmp3 = np.array(tmp3)

# The nearest KG triple found by the query
idx = np.argmin(tmp3)
print('The nearest triple is', kglist[idx])

seg, hidden = ltp.seg([text])

print(seg, "seg")
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
print("ner", ner)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)

seg = seg[0]
for i in sdp[0]:
    print(i, seg[i[0] - 1], seg[i[1] - 1])  # Indices are 1-based; subtract 1 for the real index.
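
cosine_distance and vec2 are not shown; a plausible cosine_distance consistent with the np.argmin above (smaller means closer), assuming vec2 maps text to a fixed-size numpy vector:

import numpy as np


def cosine_distance(a, b):
    # 1 - cosine similarity, so np.argmin picks the most similar entry.
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)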
Example #6
import json
from typing import List

from ltp import LTP, FastLTP


class Run(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result

    def test(self, sentences: List[str] = None):
        self.ltp.add_words("DMI与主机通讯中断")
        if sentences is None:
            sentences = ["他叫汤姆去拿外衣。"]
        res = self._predict([sentence.strip() for sentence in sentences])
        print(json.dumps(res, indent=2, sort_keys=True, ensure_ascii=False))

    def save(self, out='ltp.npz'):
        import numpy as np
        nps = {}
        for k, v in self.ltp.model.state_dict().items():
            # Map legacy LayerNorm parameter names (gamma/beta) to weight/bias.
            k = k.replace("gamma", "weight").replace("beta", "bias")
            nps[k] = np.ascontiguousarray(v.cpu().numpy())

        np.savez(out, **nps)

        config = self.ltp.config
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

    def test_seged(self):
        import torch
        sentences = [
            'My name is tom.', 'He called Tom to get coats.', '他叫Tom去拿外衣。',
            '他叫汤姆去拿外衣。', "我去长江大桥玩。"
        ]
        seg, hidden = self.ltp.seg(sentences)
        seged, hidden_seged = self.ltp.seg(seg, is_preseged=True)
        hidden: dict
        hidden_seged: dict
        for key, value in hidden.items():
            if isinstance(value, torch.Tensor):
                test = torch.sum(value.float() -
                                 hidden_seged[key].float()).numpy()
                print(key, test)

        print(seg == seged)
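
The Run class reads like a python-fire CLI target; a sketch of a possible entry point (the fire wiring is an assumption, not shown in the source):

import fire

if __name__ == '__main__':
    fire.Fire(Run)
    # e.g. python run.py test
    # e.g. python run.py save --out=ltp.npz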
Example #7
from typing import List

from ltp import LTP, FastLTP


class Server(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device)
        else:
            self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result
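
A quick usage sketch of this variant (the sentence is a placeholder; the output fields follow the dicts assembled above):

server = Server(path='small')
for item in server._predict(["他叫汤姆去拿外衣。"]):
    print(item['text'], [w['text'] for w in item['words']])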
Example #8
# Initialize the tokenizer
# (Excerpt: assumes `proxies`, `pdfreader`, and the helpers `getTriad` /
# `getConnection` are defined elsewhere in the project.)
from ltp import LTP

ltp = LTP(proxies=proxies)

sentences = pdfreader.getTestFromPdf()['Text']
seg = []
sdp = []
dep = []
pos = []
cluster = []

# Segment each sentence and run semantic dependency parsing
for st in sentences:
    if st != '':
        seg_temp, hidden = ltp.seg([st])
        # Semantic dependency relations
        sdp.append(ltp.sdp(hidden)[0])
        # Part-of-speech list
        pos.append(ltp.pos(hidden)[0])
        # Segmented word list
        seg.append(seg_temp[0])
        # Syntactic dependency relations
        dep.append(ltp.dep(hidden)[0])

# Initialize the list that stores the triples
resultTriad = []

for index in range(len(dep)):
    r = getTriad(dep[index], seg[index], pos[index])
    resultTriad.append(r)

# Initialize the database connection
graph = getConnection()
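
getTriad is not shown; a minimal sketch of one plausible implementation that pairs SBV (subject-verb) and VOB (verb-object) arcs from LTP's dep triples (the exact rules in the original are unknown):

def getTriad(dep, seg, pos):
    # dep entries are (child, head, relation) with 1-based word indices.
    subjects, objects = {}, {}
    for child, head, rel in dep:
        if rel == 'SBV':    # subject attached to a predicate
            subjects[head] = seg[child - 1]
        elif rel == 'VOB':  # object attached to a predicate
            objects[head] = seg[child - 1]
    # Emit (subject, predicate, object) for predicates that have both.
    return [(subjects[v], seg[v - 1], objects[v])
            for v in subjects if v in objects]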
Example #9
from itertools import zip_longest
from typing import List, Optional

import conllu

from ltp import LTP, FastLTP

# iter_lines is a project-local helper (not shown).


class Conllu(object):
    """
    :param path: model path, or one of ['base', 'small', 'tiny'] to download automatically
    :param batch_size: maximum batch size; input is split into batches automatically
    :param device: ['cpu', 'cuda']
    :param onnx: whether to enable onnx
    """
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):

        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        self._split = lambda a: map(lambda b: a[b:b + batch_size],
                                    range(0, len(a), batch_size))

    def _predict(self,
                 sentences: List[str],
                 pos=True,
                 ner=True,
                 srl=True,
                 dep=True,
                 sdp=True):
        result = []
        for sentences_batch in self._split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)

            batch_size = len(sentences_batch)
            batch_pos = self.ltp.pos(hidden) if pos else ([[]] * batch_size)
            batch_ner = self.ltp.ner(hidden) if ner else ([None] * batch_size)
            batch_srl = self.ltp.srl(
                hidden, keep_empty=False) if srl else ([None] * batch_size)
            batch_dep = self.ltp.dep(hidden) if dep else ([None] * batch_size)
            batch_sdp = self.ltp.sdp(hidden) if sdp else ([None] * batch_size)

            result += list(
                zip(batch_seg, batch_pos, batch_ner, batch_dep, batch_sdp,
                    batch_srl))

        return result

    def predict(self,
                input: str,
                output: Optional[str] = None,
                pos: bool = True,
                ner: bool = False,
                srl: bool = False,
                dep: bool = True,
                sdp: bool = False):
        """
        预测文本并输出为 conllu 格式
        :param input: 要预测的文件,每行一句话
        :param output: 输出的结果文件,默认是输入文件添加 .conll 后缀
        :param pos: 是否输出 词性标注 结果 ['True','False']
        :param ner: 是否输出 命名实体识别 结果 ['True','False'], 占用 conllu feats 列
        :param srl: 是否输出 语义角色标注 结果 ['True','False'], 占用 conllu misc 列
        :param dep: 是否输出 依存句法分析 结果 ['True','False']
        :param sdp: 是否输出 语义依存分析 结果 ['True','False']
        """
        if output is None:
            output = f"{input}.conllu"

        with open(output, mode='w', encoding='utf-8') as f:
            sentences = sum([sent for idx, sent in iter_lines(input)], [])
            results = self._predict(sentences, pos, ner, srl, dep, sdp)

            for text, (seg_s, pos_s, ner_s, dep_s, sdp_s,
                       srl_s) in zip(sentences, results):
                tokens = conllu.TokenList([
                    conllu.models.Token(id=idx + 1,
                                        form=token,
                                        lemma=token,
                                        upos=pos if pos else '_',
                                        xpos=pos if pos else '_',
                                        feats='O' if ner else '_',
                                        head=idx,
                                        deprel='_',
                                        deps='' if sdp else '_',
                                        misc='SpaceAfter=No')
                    for idx, (token,
                              pos) in enumerate(zip_longest(seg_s, pos_s))
                ], conllu.models.Metadata(text=text))

                if ner:
                    for tag, start, end in ner_s:
                        tokens[start]['feats'] = f'B-{tag}'
                        for i in range(start + 1, end):
                            tokens[i]['feats'] = f'I-{tag}'
                if dep:
                    for id, head, tag in dep_s:
                        tokens[id - 1]['head'] = head
                        tokens[id - 1]['deprel'] = tag
                if sdp:
                    for id, head, tag in sdp_s:
                        if tokens[id - 1]['deps']:
                            tokens[id - 1]['deps'] = tokens[
                                id - 1]['deps'] + f"|{head}:{tag}"
                        else:
                            tokens[id - 1]['deps'] = f"{head}:{tag}"

                if srl and srl_s:  # guard: zip(*[]) fails when a sentence has no predicates
                    srl_predicate, srl_roles = list(zip(*srl_s))
                    srl_predicate_num = len(srl_predicate)
                    if srl_predicate_num > 0:
                        srl_misc = [[
                            f'Predicate={"Y" if i in srl_predicate else "_"}',
                            ['O'] * srl_predicate_num
                        ] for i in range(len(tokens))]
                        for idx, srl_role in enumerate(srl_roles):
                            for tag, start, end in srl_role:
                                srl_misc[start][-1][idx] = f'B-{tag}'
                                for i in range(start + 1, end):
                                    srl_misc[i][-1][idx] = f'I-{tag}'
                        srl_misc = [
                            "|".join([s[0], "Role=" + ",".join(s[-1])])
                            for s in srl_misc
                        ]

                        for token, misc in zip(tokens, srl_misc):
                            token['misc'] = f"{token['misc']}|{misc}"

                f.write(tokens.serialize())
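
A quick usage sketch (the input file name is a placeholder; one sentence per line, as the docstring describes):

if __name__ == '__main__':
    Conllu(path='small').predict('sentences.txt', pos=True, dep=True)
    # writes sentences.txt.conllu with one CoNLL-U block per input sentence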