Example #1
    def get_pos_tag(self, sentence):
        r"""
        pos tag function.

        :param str sentence: the sentence need to be ner
        :return: the triple form (tags,start,end)
        """

        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Join the list of tokens back into a single sentence string
            sentence = ''.join(sentence)

        if not sentence:
            return []

        if self.__pos is None:
            # lazily load the LTP model
            self.__pos = LTP()
        seg, hidden = self.__pos.seg([sentence])
        pos = self.__pos.pos(hidden)
        seg = seg[0]
        pos = pos[0]
        pos_tag = []
        cnt = 0
        # convert word-level tags into character-level (tag, start, end) spans
        for i in range(len(pos)):
            pos_tag.append([pos[i], cnt, cnt + len(seg[i]) - 1])
            cnt += len(seg[i])

        return pos_tag
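A minimal standalone sketch of the same word-to-character-offset conversion, assuming the ltp 4.x seg/pos API used throughout these examples; the sentence and the tags shown are illustrative.

from ltp import LTP

ltp = LTP()  # loads the default small model
seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
pos = ltp.pos(hidden)

# pair each word's POS tag with its character-level (start, end) span
tags, cnt = [], 0
for word, tag in zip(seg[0], pos[0]):
    tags.append([tag, cnt, cnt + len(word) - 1])
    cnt += len(word)
print(tags)  # e.g. [['r', 0, 0], ['v', 1, 1], ['nh', 2, 3], ...]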
Example #2
class Ner:
    def __init__(self):
        self.ltp = LTP()
    
    def preprocess(self, sent):
        return re.sub(r'\s+', '', sent)

    def ner(self, sents):
        assert not any(re.search(r'\s', x) for x in sents), "no space is allowed"
        psents = [x for x in sents if x != '']
        if len(psents) == 0:
            return [[] for x in sents]
        segment, hidden = self.ltp.seg(psents)
        ne = self.ltp.ner(hidden)
        anes = []
        for sseg, sne in zip(segment, ne):
            nes = []
            # prefix sums of token lengths -> character offset at which each token starts
            slens = [0] + [len(x) for x in sseg]
            for i in range(1, len(slens)):
                slens[i] += slens[i - 1]
            for t, x, y in sne:
                if t == 'Ns':  # keep only geographic-name (Ns) entities
                    nes.append([slens[x], slens[y + 1]])
            anes.append(nes)
        fnes = []
        cur = 0
        for s in sents:
            if s == '':
                fnes.append([])
            else:
                fnes.append(anes[cur])
                cur += 1
        return fnes
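A hedged usage sketch of the Ner class above, assuming module-level imports of re and LTP and that 'Ns' is LTP's tag for geographic names; the span shown is illustrative.

import re
from ltp import LTP

ner = Ner()
sents = [ner.preprocess(s) for s in ["我 住在哈尔滨", ""]]
print(ner.ner(sents))  # e.g. [[[3, 6]], []] -- character span of 哈尔滨, and [] for the empty sentence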
Example #3
def seg_with_ltp40(in_file, out_file_path, manual_seg_file):
    # initialize the model
    ltp = LTP()
    line_list = []

    # save seg_result
    corpus = construct_corpus(in_file)
    f = open(out_file_path, "w", encoding='utf-8')
    for line in corpus:
        line_list.append(line)  # wrap each sentence in a single-element list ["Xxxx"]
        seg_result, hidden = ltp.seg(line_list)
        f.write("=".join(seg_result[0]) + "\n")
        line_list.clear()
        f.flush()

    # test qps
    corpus = construct_corpus(in_file, 1)
    start = time.time()
    for line in corpus:
        segment, hidden = ltp.seg(list(line))
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
Example #4
 def __init__(self,
              path: str = 'small',
              batch_size: int = 50,
              device: str = None,
              onnx: bool = False):
     self.ltp = LTP(path=path, device=device)
     self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                range(0, len(a), batch_size))
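The split lambda simply yields the input in batch_size-sized slices; a small isolated sketch of the same pattern (the names here are illustrative, not from the original):

batch_size = 50
split = lambda a: map(lambda b: a[b:b + batch_size],
                      range(0, len(a), batch_size))

sentences = ["sent %d" % i for i in range(120)]
print([len(batch) for batch in split(sentences)])  # [50, 50, 20]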
Example #5
def ltp_func(text_list):
    ltp = LTP()
    seg, hidden = ltp.seg(text_list)
    pos = ltp.pos(hidden)
    result = []
    # pair each word of the first sentence with its POS tag as "word/tag"
    for idx, val in enumerate(seg[0]):
        pag = [val, pos[0][idx]]
        result.append('/'.join(pag))
    return result
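A call sketch for ltp_func; note that it only tags the first sentence in text_list. The output shown is roughly what LTP's documentation example sentence would produce:

print(ltp_func(["他叫汤姆去拿外衣。"]))
# e.g. ['他/r', '叫/v', '汤姆/nh', '去/v', '拿/v', '外衣/n', '。/wp']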
Example #6
 def dependency(self):
     sentence = self.sentence
     sentences = [sentence]
     ltp = LTP()
     seg, hidden = ltp.seg(sentences)
     dep = ltp.dep(hidden)
     print(seg)
     print(dep)
Example #7
 def __init__(self,
              path: str = 'small',
              batch_size: int = 10,
              device: str = None,
              onnx: str = None,
              vocab: str = None):
     self.ltp = LTP(path=path,
                    batch_size=batch_size,
                    device=device,
                    vocab=vocab)
Example #8
 def __init__(self,
              default_model_dir=LTP4_MODEL_DIR,
              user_dict_dir=USER_DICT_DIR):
     self.ltp = LTP(path=default_model_dir)
     for file in os.listdir(user_dict_dir):
         self.ltp.init_dict(path=os.path.join(user_dict_dir, file))
     self.sentences = []
     self.postags = []
     self.nertags = []
     self.dep = []
Example #9
 def __init__(self,
              path: str = 'small',
              batch_size: int = 50,
              device: str = None,
              onnx: bool = False):
     if onnx:
         self.ltp = FastLTP(path=path, device=device, need_config=True)
     else:
         self.ltp = LTP(path=path, device=device, need_config=True)
     self._split = lambda a: map(lambda b: a[b:b + batch_size],
                                 range(0, len(a), batch_size))
Example #10
def work_summary_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    for i, row in enumerate(f):
        if i != 0:
            val = row[1][5:].split(',')
            paList.append(val[2])
    wa, ha = ltp.seg(paList)
    pa = ltp.pos(ha)
    return wa, pa
Example #11
 def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
     self.default_user_dict_dir = user_dict_dir
     # load the LTP model
     self.ltp = LTP(model_type)
     # Add the user dictionaries (a legal-document dictionary and the Tsinghua legal lexicon);
     # adding them to memory this way is faster
     files = os.listdir(user_dict_dir)
     for file in files:
         file_path = os.path.join(user_dict_dir, file)
         # skip directories
         if os.path.isdir(file_path):
             continue
         self.ltp.init_dict(file_path)
Example #12
def mongo2ner(idx, ltp, offset, size):
    """
    根据offset从mongo中取指定size的文章
    :param idx:
    :param offset:
    :param size:
    :return:
    """
    entities = []
    pid = os.getpid()
    try:
        # debug_logger.debug("{} ---pid:{} MongoDB: Skip: {}, size: {}".format(idx, pid, offset, size))
        ltp = LTP(path=LTP4_MODEL_DIR)
        db_connect = MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        db = db_connect[MONGODB_DATABASE_NAME]
        coll = db[MONGODB_ENTMT_COLLECTION]
        # debug_logger.debug("pid: {}, connected".format(pid))
        for art in coll.find(skip=offset, limit=size):
            debug_logger.debug(art['title'])
            text = art['title'] + art['content']
            entities_of_art = get_article_entities(idx, text, ltp)
            entities += entities_of_art

        # debug_logger.debug("pid: {}, write".format(pid))
        with open(os.path.join(USER_DICT_DIR, 'ners_' + str(idx) + '.txt'), 'w') as fw:
            for item in entities:
                for word, label in item:
                    fw.write(word + '\t' + label + '\n')
    except Exception as e:
        print("ERROR mongo2ner: {}".format(e))
        # debug_logger.debug("ERROR mongo2ner: {}".format(e))
    return entities
Example #13
 def findFood(self, sentence):
     ltp = LTP()
     words, hidden = ltp.seg([sentence])
     posTags = ltp.pos(hidden)
     words = words[0]  # word segmentation result list
     posTags = posTags[0]  # POS tagging result list
     dep = ltp.dep(hidden)[0]  # dependency parsing result list
     relyId = [d[1] for d in dep]  # head (parent) node id list
     relation = [d[2] for d in dep]  # relation label list
     heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head node tokens
     string = ''
     for i in range(len(words)):
         if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
             string += words[i]
             string += ' '
     return string
Example #14
def WriteTest(readfile, savefile):
    with open(readfile, "r", encoding="utf-8") as rfp:
        ltp = LTP()
        logger.info("Processing file:%s ." % (readfile))
        with open(savefile, 'w', encoding='utf-8') as wfp:

            for row in tqdm(rfp, desc="file %s process" % (readfile)):
                sent1, sent2 = row.split('\t')
                seg, hid = ltp.seg([sent1, sent2])
                sdp = ltp.sdp(hid, mode='tree')
                pos = ltp.pos(hid)
                tmpitem = {
                    'sentence1': [seg[0], pos[0], sdp[0]],
                    'sentence2': [seg[1], pos[1], sdp[1]]
                }
                jsonline = json.dumps(tmpitem)
                wfp.write(jsonline + "\n")
Example #15
    def get_ner(self, sentence):
        r"""
        NER function.

        :param str sent: the sentence need to be ner
        :return two forms of tags
            The first is the triple form (tags,start,end)
            The second is the list form, which marks the ner label of each word
            such as 周小明去玩
            ['Nh', 'Nh', 'Nh', 'O', 'O']
        """
        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Join the list of tokens back into a single sentence string
            sentence = ''.join(sentence)

        if not sentence:
            return [], []

        if self.__ner is None:
            self.__ner = LTP()
        seg, hidden = self.__ner.seg([sentence])
        seg = seg[0]
        ner = self.__ner.ner(hidden)
        ner = ner[0]

        ner_label = len(sentence) * ['O']
        for i in range(len(ner)):
            tag, start, end = ner[i]
            # convert the word-level span (start, end) to character offsets
            tmp = 0
            for j in range(start):
                tmp += len(seg[j])
            start = tmp
            tmp = 0
            for j in range(end + 1):
                tmp += len(seg[j])
            end = tmp
            ner[i] = (tag, start, end - 1)
            # mark every character inside the span with the entity tag
            for j in range(start, end):
                ner_label[j] = tag

        return ner, ner_label
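For the docstring example 周小明去玩, the two return forms relate as follows; a sketch of just the label expansion, with the character-level NER triple hard-coded:

sentence = "周小明去玩"
ner = [("Nh", 0, 2)]  # character-level triple: 周小明 is a person name (Nh)

ner_label = ["O"] * len(sentence)
for tag, start, end in ner:
    for j in range(start, end + 1):
        ner_label[j] = tag
print(ner_label)  # ['Nh', 'Nh', 'Nh', 'O', 'O']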
Example #16
def findFood(sentence):
    ltp = LTP()
    words, hidden = ltp.seg([sentence])
    posTags = ltp.pos(hidden)
    words = words[0]  # word segmentation result list
    print(words)
    posTags = posTags[0]  # POS tagging result list
    print(posTags)
    dep = ltp.dep(hidden)[0]  # dependency parsing result list
    for t in dep:
        print(t)
    relyId = [d[1] for d in dep]  # head (parent) node id list
    relation = [d[2] for d in dep]  # relation label list
    heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # head node tokens
    for i in range(len(words)):
        if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
            print("找到了一种食物:" + words[i])
Example #17
class NamedEntity:
    def __init__(self, user_dict):
        self.ltp = LTP()  # loads the small model by default
        # user_dict.txt is the user dictionary file; max_window is the maximum forward-matching window
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """
        命名实体识别
        :param text: 原始文本
        :return: 从原始文本中抽取的命名实体
        """
        seg, hidden = self.ltp.seg(text)   # 分词
        ner = self.ltp.ner(hidden)
        entity = []
        for tag, start, end in ner[0]:
            # join the word tokens that make up the entity into its surface string
            entity.append(''.join(seg[0][start:end + 1]))
        return entity
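A hedged usage sketch of NamedEntity; the dictionary path is illustrative and the printed entities are only an example of what the model might return:

from ltp import LTP

ne = NamedEntity(user_dict="user_dict.txt")  # illustrative dictionary path
print(ne.entity_recognition(["汤姆去北京旅游。"]))  # e.g. ['汤姆', '北京']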
Example #18
    def __init__(self, seq_len=512):
        """
        Constructs Huggingface CN tokenizer & other
            col: What column to tokenize if pretraining
        """

        self.tokenizer_cn = AutoTokenizer.from_pretrained("bert-base-chinese")
        self.tokenizer_ltp = LTP("small")
        self.max_seq_length = seq_len
Example #19
    def is_word(sentence):
        r"""
        Judge whether the input is a single word.

        :param str sentence: input sentence string
        :return bool: is a word or not
        """
        from ltp import LTP
        if sentence[0] == sentence[1]:
            return True
        ltp = LTP()
        seg, hidden = ltp.seg([sentence])
        pos = ltp.pos(hidden)
        pos = pos[0]
        if len(pos) == 1 and pos[0] == 'n':
            return False
        return True
Example #20
def work_detail_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    pbList = []
    for i, row in enumerate(f):
        if i != 0:
            val = row[1][5:].split(',')
            paList.append(val[2])
            temp = val[3:]
            for v in temp:
                pbList.append(v)
    # print(paList)
    # print(pbList)
    sa, ha = ltp.seg(paList)
    sb, hb = ltp.seg(pbList)
    pa = ltp.pos(ha)
    pb = ltp.pos(hb)

    return sa, sb, pa, pb
Example #21
 def test_nlp_model(self):
     ltp1 = LTP(LTP4_MODEL_DIR)
     ltp2 = LTP(LTP4_MODEL_DIR)
     ltp3 = LTP(LTP4_MODEL_DIR)
     ltp4 = LTP(LTP4_MODEL_DIR)
     ltp5 = LTP(LTP4_MODEL_DIR)
     ltp6 = LTP(LTP4_MODEL_DIR)
     ltp7 = LTP(LTP4_MODEL_DIR)
     print('-------')
     import time
     time.sleep(10)
Example #22
def create():
    """create profession keywords json file.
    """
    ltp = LTP()  # loads the small model by default
    # import the professions file
    with open('./dataset/profession.json', 'r', encoding='utf-8') as jsonfile:
        profession_json = json.load(jsonfile)

    for i, profession in enumerate(profession_json['data']):
        profession_json['data'][i]['kwords'] = find_kwords_by_ltp(
            profession['name'], ltp)

    with open('./dataset/profession2.json', 'w', encoding='utf-8') as jsonfile:
        json.dump(profession_json, jsonfile, ensure_ascii=False)
Example #23
def main(args):
    # For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # If we want to fine-tune these model, we have to use same tokenizer : LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()

    ltp_tokenizer = LTP(args.ltp)  # faster in GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
Example #24
def new_generate_ltp_results():
    # load the model
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)

    # read the original sentences
    data = read_file_in_ltp('../data/train_base.json')
    sentences = list(map(lambda x: x['content'], data))

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # word segmentation
        segmented0, hidden = ltp.seg([sent])
        # POS tagging
        cur_pos = ltp.pos(hidden)
        # named entity recognition
        cur_ner = ltp.ner(hidden)
        # semantic role labeling
        cur_srl = ltp.srl(hidden)
        # dependency parsing
        cur_dep = ltp.dep(hidden)
        # semantic dependency parsing (tree)
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')
        # semantic dependency parsing (graph)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')

        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])

    # build the mapping between each sentence and its segmentation
    sent_seg_matches = sentence_segment_match(data, segmented)
    pickle.dump([segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches], open('new_ltp_results.pk', 'wb'))

    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
Example #25
def main(args):
    # For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # If we want to fine-tune these models, we must use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8", errors='ignore') as f:
        data = f.readlines()
    print(f'开始处理数据,共有{len(data)}条')
    data = [
        line.strip() for line in data if len(line) > 0 and not line.isspace()
    ]  # avoid delimiter like '\u2029'
    print(f"开始加载ltp和bert的tokenizer模型")
    ltp_tokenizer = LTP(path=args.ltp)  # faster in GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    # prepare the mappings
    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)
    # save the mappings
    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
    print(f"保存所有{len(data)}条数据的映射关系到文件{args.save_path}")
Example #26
def thread_main(args, gpu=True):
    """
    多线程处理
    Args:
        args:
        gpu: 是否使用gpu
    Returns:

    """
    from functools import partial
    from multiprocessing import Pool
    from tqdm import tqdm
    # For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # If we want to fine-tune these models, we must use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()
    print(f'开始处理数据,共有{len(data)}条')
    data = [
        line.strip() for line in data if len(line) > 0 and not line.isspace()
    ]  # avoid delimiter like '\u2029'
    print(f"开始加载ltp和bert的tokenizer模型")
    ltp_tokenizer = LTP(path=args.ltp)  # faster in GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    newdata = [data[i:i + 1000] for i in range(0, len(data), 1000)]
    # prepare the mappings with several parallel workers
    # if using the GPU, set the multiprocessing start method as follows
    if gpu:
        import torch
        torch.multiprocessing.set_start_method('spawn')
    with Pool(processes=args.processes) as p:
        # partial_clean fixes the tokenizer arguments of prepare_ref
        partial_clean = partial(prepare_ref,
                                ltp_tokenizer=ltp_tokenizer,
                                bert_tokenizer=bert_tokenizer)
        # chunksize=8: batches are handed to each worker eight at a time
        ref_ids_nest = list(
            tqdm(p.imap(partial_clean, newdata, chunksize=8), desc="开始处理数据"))
    ref_ids = [ref for nest in ref_ids_nest for ref in nest]
    #保存映射关系
    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
    print(f"保存所有{len(data)}条数据的映射关系到文件{args.save_path}")
Example #27
def load_word_segmentation_tool():
    """
    加载分词工具
    :return: HanLP: hanlp, ltp: LTP
    """
    logger.info("loading word segmentation tool")
    # HanLP = HanLPClient(url='https://www.hanlp.com/api', auth='MTE4QGJicy5oYW5scC5jb206MXFFOHhWUkJNQXBNdlh0NA==')
    HanLP = hanlp.load(hanlp.pretrained.mtl.
                       CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH,
                       verbose=True)
    tasks = list(HanLP.tasks.keys())
    for task in tasks:
        if task not in TASK:
            del HanLP[task]
    tok = HanLP[TASK[0]]
    tok.dict_combine = {'新冠', '新冠病毒', '新冠肺炎'}
    ltp = LTP()
    logger.info("loaded word segmentation tool")
    return HanLP, ltp
Example #28
def prepare_ref(lines: List[str], ltp_tokenizer: LTP,
                bert_tokenizer: BertTokenizer):
    ltp_res = []

    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i:i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i:i + 100],
                             add_special_tokens=True,
                             truncation=True,
                             max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):

        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # We only save the positions of Chinese subwords starting with ##, which means they are part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # save chinese tokens' pos
                if len(clean_token) == 1 and _is_chinese_char(
                        ord(clean_token)):
                    ref_id.append(i)
        ref_ids.append(ref_id)

    assert len(ref_ids) == len(bert_res)

    return ref_ids
Example #29
    def load_ltp_weights(weights_type):
        '''
        Load the LTP weight files and instantiate an LTP model.
        :param weights_type: model type to load; must be one of base, small, or tiny
        :return: the LTP model with the weights loaded
        '''
        # check the model type
        assert weights_type in ['base', 'small',
                                'tiny'], 'LTP 模型只能采用 base、small、tiny三种类型的参数'

        # resolve the file path
        if LtpModelPath is None:
            file_path = os.path.abspath(
                os.path.join(os.path.dirname('.'), 'weights', weights_type))
        else:
            file_path = os.path.abspath(
                os.path.join(LtpModelPath, weights_type))

        # load the weights
        ltp = LTP(path=file_path)

        return ltp
Example #30
# [[('every', 5)], [('自然数', 'x'), 'and', ('奇数', 'x')]]
from ltp import LTP
ltp = LTP()


class NlpCtr(object):
    def __init__(self):
        self.seg = None
        self.words = None
        self.dep = None

    def trans_result(self, depArr, posArr):
        tempposArr = posArr[0]
        tempdepArr = depArr[0]

        tempArr = []
        for item in tempdepArr:
            dic = {
                'dep': item[0],
                'gov': item[1],
                'type': item[2],
                # 'pos': tempposArr[item[0] - 1]
            }
            tempArr.append(dic)
        return tempArr

    def getHED(self, words):
        root = None
        for word in words:
            if word['gov'] == 0 and word['type'] == 'HED':
                root = word['dep']