Example #1
def ner_train_data():
    ent2mention = json_load(Config.ent2mention_json)
    # recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
    tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
    from LAC import LAC
    # Load the LAC model
    lac = LAC(mode='lac')
    jieba.enable_paddle()  # Enable paddle mode (supported since jieba 0.40; earlier versions do not support it)
    _ent_patten = re.compile(r'["<](.*?)[>"]')
    for q, sparql, a in load_data():
        q_text = question_patten.findall(q)[0]
        hanlp_entities = recognizer([list(q_text)])
        hanlp_words = tokenizer(q_text)
        lac_results = lac.run(q_text)
        q_entities = _ent_patten.findall(sparql)
        jieba_results = list(jieba.cut_for_search(q_text))
        mentions = [ent2mention.get(ent) for ent in q_entities]
        print(f"q_text: {q_text}\nq_entities: {q_entities}, "
              f"\nlac_results:{lac_results}"
              f"\nhanlp_words: {hanlp_words}, "
              f"\njieba_results: {jieba_results}, "
              f"\nhanlp_entities: {hanlp_entities}, "
              f"\nmentions: {mentions}")
        import ipdb
        ipdb.set_trace()
Example #2
def rc(sentence):
    tokenizer_result = tokenizer(sentence)

    tagger = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
    tagger_result = tagger(tokenizer_result)

    token_and_tagger_dic = []
    for i in range(len(tokenizer_result)):
        token_and_tagger_dic.append((tokenizer_result[i], tagger_result[i]))

    syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
    sparser_result = syntactic_parser(token_and_tagger_dic)

    for i in range(len(sparser_result)):
        word = sparser_result[i]
        word = str(word)
        print(word.split('\t'))
        word_dic = word.split('\t')
        if (word_dic[7] == 'root'):
            print('核心是:' + word_dic[1])
            # Look for complements attached to the root (verb-complement relations)
            dong_bu = []
            for j in range(len(sparser_result)):
                word2 = str(sparser_result[j]).split('\t')
                if (word_dic[0] == word2[6] and word2[7] != 'punct'
                        and word2[7] != ' '):
                    dong_bu.append(word2[1])
                    if (word2[7] == 'attr' and word2[7] != ' '):
                        print('属性关系是:', word2[1])
                    if (word2[7] == 'cop'):
                        print('核心词修正是:', word2[1] + word_dic[1])
            print(dong_bu)
Example #3
def test():
    # for days in range(1,0,-1):
    #     target_date = datetime.now().date() + timedelta(days=-days)
    #     with open("./predict/{}.json".format(target_date), "r", encoding="utf-8") as f:
    #         old_data = json.load(f)
    #         with open("./predict/top/{}.json".format(target_date), "r", encoding="utf-8") as f1:
    #             new_data = json.load(f1)
    #             for nd in new_data:
    #                 for od in old_data:
    #                     if nd['topic'] == od ['topic']:
    #                         nd['kw'] = od ['kw']
    #             with open("./predict/top/{}.json".format(target_date), "w", encoding="utf-8") as f2:
    #                 json.dump(new_data, f2)
    HanLP = hanlp.load(hanlp.pretrained.mtl.
                       CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH,
                       verbose=True)
    tasks = list(HanLP.tasks.keys())
    print(tasks)
    for task in tasks:
        if task not in TASK:
            del HanLP[task]
    tok = HanLP[TASK[0]]
    tok.dict_combine = {'新冠', '新冠病毒', '新冠肺炎'}
    print(
        HanLP(
            "中新网3月31日电 据云南省委宣传部微博消息,3月31日晚,瑞丽市新冠肺炎疫情防控第二场新闻发布会召开,通报瑞丽市新冠肺炎疫防控情最新情况。"
        )["tok/coarse"])
    pass
Example #4
 def __init__(self):
     self.recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
     self.max_sent_len = 126
     self.ent_type_map = {
         'NR': '人物',
         'NT': '机构'
     }
     self.black_list = {'公司'}
Example #5
 def __init__(self):
     self.tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
     self.stopDict = {}
     # StopWords List comes from: https://github.com/goto456/stopwords
     # Read stop words into a dict so that membership checks run in constant time
     with open(stop_words_path, 'r', encoding='utf-8') as f:
         for word in f.readlines():
             self.stopDict[word.strip()] = True
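
A small, self-contained sketch of the same pattern (the stop-word file path and the sample sentence are placeholders, not taken from the original class): dict membership tests are O(1), so filtering stays cheap even with a large stop-word list.

import hanlp

tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
stop_dict = {}
with open('stopwords.txt', 'r', encoding='utf-8') as f:  # placeholder path
    for word in f:
        stop_dict[word.strip()] = True

tokens = tokenizer('商品和服务')
# Constant-time lookups against the stop-word dict
print([t for t in tokens if t not in stop_dict])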
Example #6
def run():
    recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_ALBERT_BASE_ZH)
    res = test_time()
    ret = []
    for i in res:
        i = i.split(',')
        for k in i:
            ret.append(list(k))
    # Run NER once over all of the accumulated character lists
    return recognizer(ret)
Example #7
def load_model(model_name):
    task = find_task_model(model_name)
    if task == 'classification':
        task = 'classifiers'
    if task is None or task in ['cws']:
        return print('Model does not exist in HanLP')
    module = getattr(hanlp.pretrained, task)
    module = getattr(module, model_name)
    model = hanlp.load(module)
    model.name = model_name
    return model
Example #8
def load_tokenizer(language, **kwargs):
    '''
    The only supported kwarg is chinese_tok; it defaults to 'CTB6_CONVSEG'.
    Several models for Chinese tokenization are available (check dir(hanlp.pretrained.cws)).
    '''
    if language == 'chinese' or language == 'zh':
        if not kwargs or 'chinese_tok' not in kwargs.keys():
            kwargs = {'chinese_tok': 'CTB6_CONVSEG'}
        tokenizer = hanlp.load(kwargs['chinese_tok'])
    elif language == 'english' or language == 'en':
        tokenizer = hanlp.utils.rules.tokenize_english
    else:
        raise ValueError(f'Unsupported language: {language}')
    return tokenizer
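
A hedged usage sketch for the loader above; the model identifiers reuse names that already appear elsewhere in this collection, and the sample strings are placeholders.

# Chinese with the default model ('CTB6_CONVSEG')
zh_tok = load_tokenizer('zh')
print(zh_tok('商品和服务'))

# Chinese with an explicitly chosen CWS model
zh_tok_albert = load_tokenizer('chinese', chinese_tok='LARGE_ALBERT_BASE')

# English falls back to the rule-based tokenizer; no model download needed
en_tok = load_tokenizer('en')
print(en_tok('Commodities and services'))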
Example #9
def tokenizer_pku():
    with open('dict.json', 'r', encoding='utf-8') as f:
        custom_dict = json.load(f)
    trie = Trie()
    trie.update(custom_dict)
    print(type(trie))
    text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
    print(split_sents(text, trie))
    tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    pipeline = hanlp.pipeline() \
        .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
        .append(tokenizer, input_key='parts', output_key='tokens') \
        .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
    print(pipeline(text))
Example #10
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        labels = self.get_labels()
        import hanlp
        tokenizer = hanlp.load('LARGE_ALBERT_BASE')
        for i, line in enumerate(lines):
            line_split = line.split('###')
            if i % 10 == 0:
                print(f"完成了{i}条处理")
            if len(line_split) != 3:
                continue
            content, keyword, labelid = line_split
            text_a = content
            # Split the sentence with hanlp; each resulting word becomes sentence b
            keywords = tokenizer(text_a)
            for kidx, newkey in enumerate(keywords):
                guid = "%s-%s-%s" % (set_type, i, kidx)
                text_b = newkey
                # If the keyword segmented by hanlp differs from the keyword labeled in this sample, treat it as a negative sample with label 2, i.e. the "other" class
                if keyword not in newkey:
                    newlabel = 2
                else:
                    newlabel = labelid
                # labels map from [-1, 0, 1] --> [0, 1, 2]
                label_id = int(newlabel)
                # label_id --> NEG, NEU, POS
                label = labels[label_id]
                examples.append(
                    InputExample(guid=guid,
                                 text_a=text_a,
                                 text_b=text_b,
                                 label=label))

            # If keyword is "None", it is our custom placeholder used at prediction time, so do not add it to the sample set
            if keyword == "None":
                continue
            # Also add the original keyword as a sample
            guid = "%s-%s-%s" % (set_type, i, kidx + 1)
            label_id = int(labelid)
            # label_id --> NEG, NEU, POS
            label = labels[label_id]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=keyword,
                             label=label))
        return examples
Example #11
def bert_ner():
    recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
    fp = open('test2.txt', 'r', encoding='utf-8')
    data1 = fp.read()
    fp.close()
    # list_data = data.split(",")
    list_data = re.split(r'[,。;\s]\s*', data1)
    data2 = []
    for li in list_data:
        data2.append(list(li))
    rec = recognizer(data2)
    print(rec)
    with open('bert_ner.txt', 'w', encoding='utf-8') as f:
        for r in rec:
            for i in r:
                if len(i[0]) > 1:
                    f.write(i[0] + '\n')
Example #12
def evaluate(result_path='logs'):
    strings = []
    choices = random.choices(range(len(os.listdir('datas'))), k=10)
    for idx, p in enumerate(os.listdir('datas')):
        if idx in choices:
            strings.append(
                json.load(open('datas/' + p))['dialogues']['1']['content'])

    for key in hanlp.pretrained.ALL:
        print(key)
        try:
            tokenizer = hanlp.load(key)
            for idx, string in enumerate(strings):
                with open(f'{result_path}/{idx}.txt', 'a') as f:
                    f.write(f'>>>>>\n{key}\n{" ".join(tokenizer(string))}\n')
        except Exception:
            # Some pretrained keys are not tokenizers or fail to load; skip them
            pass
Example #13
def load_word_segmentation_tool():
    """
    加载分词工具
    :return: HanLP: hanlp, ltp: LTP
    """
    logger.info("loading word segmentation tool")
    # HanLP = HanLPClient(url='https://www.hanlp.com/api', auth='MTE4QGJicy5oYW5scC5jb206MXFFOHhWUkJNQXBNdlh0NA==')
    HanLP = hanlp.load(hanlp.pretrained.mtl.
                       CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH,
                       verbose=True)
    tasks = list(HanLP.tasks.keys())
    for task in tasks:
        if task not in TASK:
            del HanLP[task]
    tok = HanLP[TASK[0]]
    tok.dict_combine = {'新冠', '新冠病毒', '新冠肺炎'}
    ltp = LTP()
    logger.info("loaded word segmentation tool")
    return HanLP, ltp
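
A minimal usage sketch for the loader above, assuming TASK[0] is the tokenization task as in the snippet; the sample sentence reuses text from Example #3.

HanLP, ltp = load_word_segmentation_tool()
# Only the tasks listed in TASK remain; coarse tokenization applies the combine dictionary
print(HanLP('瑞丽市新冠肺炎疫情防控第二场新闻发布会召开')["tok/coarse"])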
Example #14
def cws_data():
    tokenizer = hanlp.load("PKU_NAME_MERGED_SIX_MONTHS_CONVSEG")
    pipeline = hanlp.pipeline() \
        .append(split_sentence, output_key="sentences") \
        .append(tokenizer, output_key="tokens")
    book_results = {}

    for author in book_list:
        author_dir = data_dir / author
        for book in book_list[author]:
            book_res_file = author_dir / "{}.json".format(book)
            if book_res_file.exists():
                continue
            print("Processing: {} of {}".format(book, author))
            book_file = author_dir / "{}.clean.txt".format(book)
            book_text = book_file.read_text(encoding="utf-8")
            book_res = pipeline(book_text)
            book_results[book] = book_res
            with book_res_file.open(mode="w") as f:
                json.dump(book_res, f)
            print("Processing finished: {} of {}".format(book, author))
Example #15
def segment(fin, fon):
    '''
    Segment (tokenize) a Chinese text and save it 
    Input:
        input file (path + file), encoded in UTF-8
    Output:
        tokenized file (path + file), encoded in UTF-8
    '''
    if not os.path.isfile(fin):
        print(f"Sorry, file {fin} doesn't exist!")
        return None
    if os.path.isfile(fon):
        print(f"Sorry, file {fon} already exists!")
        return None

    delim = ' '  # delimiter between tokens
    tokenizer = hanlp.load('CTB6_CONVSEG')

    n = 0
    with open(fon, "w", encoding='UTF-8', newline="\n") as fo:
        with open(fin, "r", encoding='UTF-8') as fi:
            cnt = 0
            batch = ''
            for line in fi:
                cnt += 1
                batch += line
                n += 1
                if cnt % BATCH_SIZE == 0:
                    tokens = tokenizer(batch)
                    batch_out = delim.join(tokens).replace(
                        f'{delim}\n{delim}', '\n')
                    fo.write(batch_out)
                    batch = ''
                if n % 1000 == 0:
                    fo.flush()
                    print(f"{n} lines processed...")
            if batch:
                tokens = tokenizer(batch)
                batch_out = delim.join(tokens).replace(f'{delim}\n{delim}', '\n')
                fo.write(batch_out)
Example #16
def seg_with_han200(in_file, out_file_path, manual_seg_file):
    # initialization model
    tokenizer = hanlp.load("PKU_NAME_MERGED_SIX_MONTHS_CONVSEG")  # 以默认配置加载模型

    # save seg_result
    corpus = construct_corpus(in_file)
    with open(out_file_path, "w", encoding='utf-8') as f:
        for line in corpus:
            f.write("=".join(tokenizer(line)) + "\n")
            f.flush()

    # test qps (Baidu is skipped for now because a delay was added)
    corpus = construct_corpus(in_file, 1)
    start = time.time()
    for line in corpus:
        tokenizer(line)
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
Example #17
def ner():
    data = count()
    ner_output = ""
    data2 = []
    text = ""
    page = ""
    if request.method == "POST":
        ner_post = request.form['ner_post']
        text = ner_post
        # print(ner_post)
        try:
            # tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
            # ner_output = tokenizer(ner_post)
            recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
            list_data = re.split(r'[,。;\s]\s*', ner_post)
            data1 = []
            for li in list_data:
                data1.append(list(li))
            ner_output = recognizer(data1)
            for n in ner_output:
                for i in n:
                    if len(i[0]) > 1:
                        data2.append(i[0])
            ner_output = data2
            src = str(ner_post)
            for i in data2:
                temp = src.replace(str(i), str("<mark>" + i + "</mark>"))
                src = temp
            page = "<p>" + src + "</p>"
            # print(page)
        except BaseException as e:
            print('error: ' + str(e))
    return render_template('ner.html',
                           data=data,
                           text=text,
                           ner_output=ner_output,
                           page=page)
Example #18
 def load_model(self, oss_conf: namedtuple):
     download_model_from_oss(oss_conf, MTL_MODEL_KEY, SAVE_DIR)
     model_dir = os.path.join(
         SAVE_DIR,
         # Drop the ".tar.gz" suffix to get the extracted directory name
         os.path.basename(MTL_MODEL_KEY).replace(".tar.gz", ""))
     self.mlt = hanlp.load(model_dir)
Example #19
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 03:24
import hanlp

tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
tagger = hanlp.load('CTB5_POS_RNN_FASTTEXT_ZH')
syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH')
semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH')

pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens') \
    .append(tagger, output_key='part_of_speech_tags') \
    .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies') \
    .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies')
print(pipeline)

text = '''HanLP是一系列模型与算法组成的自然语言处理工具包,目标是普及自然语言处理在生产环境中的应用。
HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。
内部算法经过工业界和学术界考验,配套书籍《自然语言处理入门》已经出版。
'''

doc = pipeline(text)
print(doc)

# You can save the config to disk for deploying or sharing.
pipeline.save('zh.json')
# Then load it smoothly.
deployed = hanlp.components.Pipeline.from_meta('zh.json')
print(deployed)
Example #20
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-18 11:09
from hanlp_common.document import Document
import hanlp

con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL)
# To speed up, parse multiple sentences at once, and use a GPU.
print(
    con([
        "2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"
    ]))


# The rest of this tutorial is written for clever users.
# The first level of non-terminals are PoS tags. So usually a PoS model is piped.
def merge_pos_into_con(doc: Document):
    flat = isinstance(doc['pos'][0], str)
    if flat:
        doc = Document((k, [v]) for k, v in doc.items())
    for tree, tags in zip(doc['con'], doc['pos']):
        offset = 0
        for subtree in tree.subtrees(lambda t: t.height() == 2):
            tag = subtree.label()
            if tag == '_':
                subtree.set_label(tags[offset])
            offset += 1
    if flat:
        doc = doc.squeeze()
    return doc
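
The comment above notes that a PoS model is usually piped in to fill the '_' pre-terminals. A hedged sketch of one way to do that with the merge helper, reusing the CTB9_POS_ALBERT_BASE tagger that appears elsewhere in this collection (an illustration, not the author's original continuation):

pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
tokens = ["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"]
doc = Document({'tok': [tokens], 'pos': [pos(tokens)], 'con': [con(tokens)]})
# merge_pos_into_con replaces the '_' pre-terminal labels with the tagger's PoS tags
print(merge_pos_into_con(doc))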
Example #21
    # eval('test_path' + sys.argv[2])  converts the string into a variable reference
    # eval('seg_path' + sys.argv[2])

    print('test_path=', test_path)
    print('segment_path=', seg_path)

    if segment_tool == 'jieba':
        jieba.enable_paddle()  # Enable paddle mode (supported since jieba 0.40; earlier versions do not support it)

    segment_func = segment_tool + '_segment'
    start = time.perf_counter()

    if 'thulac' in segment_func:
        thu = thulac.thulac(seg_only=True)
    if 'hanlp' in segment_func:
        han_tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    with open(test_path, 'r', encoding='utf-8-sig') as f_r:  # Note: a UTF-8 BOM would prepend \ufeff to the file header, so utf-8-sig is used to avoid subtle issues
        with open(seg_path, 'w', encoding='utf-8') as f_w:
            for line_sentence in f_r:
                if 'thulac' in segment_func:
                    line_sentence = getattr(Segment(), segment_func)(thu, line_sentence)  # dispatch to the selected segmentation tool
                elif 'hanlp' in segment_func:
                    line_sentence = getattr(Segment(), segment_func)(han_tokenizer, line_sentence)
                else:
                    line_sentence = getattr(Segment(), segment_func)(line_sentence)  # dispatch to the selected segmentation tool
                if 'snow' in segment_func:
                    f_w.write(line_sentence + '\n')
                else:
                    f_w.write(line_sentence)
    end = time.perf_counter()
Example #22
import hanlp
import unittest
from multiprocessing.dummy import Pool
from hanlp_common.document import Document

mtl = hanlp.load(
    hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH,
    devices=-1)


def tokenize(mtl, text):
    return mtl(text, tasks='tok/fine')['tok/fine']


class TestMultiTaskLearning(unittest.TestCase):
    def test_mtl_single_sent(self):
        doc: Document = mtl('商品和服务')
        self.assertSequenceEqual(doc['tok/fine'], ["商品", "和", "服务"])

    def test_mtl_multiple_sents(self):
        doc: Document = mtl(['商品和服务', '研究生命'])
        self.assertSequenceEqual(doc['tok/fine'],
                                 [["商品", "和", "服务"], ["研究", "生命"]])

    def test_mtl_empty_str(self):
        mtl('')
        mtl(' ')
        mtl([''])
        mtl([' '])
        mtl(['', ' '])
        mtl(['', ' ', 'good'])
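
    # Hypothetical sketch: the Pool import and the tokenize helper above are
    # otherwise unused and appear intended for a thread-safety check like this.
    def test_mtl_multithreading(self):
        with Pool(2) as pool:
            results = pool.starmap(tokenize,
                                   [(mtl, '商品和服务'), (mtl, '研究生命')])
        self.assertSequenceEqual(results,
                                 [["商品", "和", "服务"], ["研究", "生命"]])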
Example #23
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
import hanlp

tokenizer = hanlp.load('CTB6_CONVSEG')
print(tokenizer.predict('商品和服务'))
print(tokenizer.predict(['萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。',
                         '上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。']))

text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer.predict(text))

dic = {'自定义': 'custom', '词典': 'dict', '聪明人': 'smart'}


def split_by_dic(text: str):
    # We use regular expression for the sake of simplicity.
    # However, you should use some trie trees for production
    import re
    p = re.compile('(' + '|'.join(dic.keys()) + ')')
    sents, offset, words = [], 0, []
    for m in p.finditer(text):
        sents.append(text[offset: m.start()])
        words.append((m.group(), dic[m.group()]))
        offset = m.end()
    if offset < len(text):
        sents.append(text[offset:])
        words.append((None, None))
    flat = []
    for pred, (word, tag) in zip(tokenizer.predict(sents), words):
Example #24
#coding=utf-8
from py2neo import *
from flask import *
import json
import jieba
import hanlp
import re
from sim import proc
import codecs
import math
app = Flask(__name__)
from flask_cors import *
CORS(app,supports_credentials=True)
graph = Graph('http://47.96.143.66:7474',username='******',password='******')
recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
stop_words_file = 'data/stop_words.txt'
stopwords = [x.strip() for x in codecs.open(stop_words_file, 'r', encoding='utf8').readlines()]

#cache_pre = [x.strip() for x in codecs.open("cache.txt",'r',encoding='utf8').readlines()]
print("Cache Loading......")
cache = []
cache_hint = {}
cache_len = 0

def filter(senq):
	# Strip Latin letters, digits and some punctuation from the sentence
	return re.sub(r"[A-Za-z0-9!%\[\],。]", "", senq)
def cont(lst, str):
	cond = ''
	leng = 2048
	for item in lst:
Example #25
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
import hanlp

tokenizer = hanlp.load(hanlp.pretrained.cws.LARGE_ALBERT_BASE)
print(tokenizer('商品和服务'))
print(
    tokenizer([
        '萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。',
        '上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。',
        'HanLP支援臺灣正體、香港繁體,具有新詞辨識能力的中文斷詞系統'
    ]))

text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

dic = {'自定义': 'custom', '词典': 'dict', '聪明人': 'smart'}


def split_by_dic(text: str):
    # We use regular expression for the sake of simplicity.
    # However, you should use some trie trees for production
    import re
    p = re.compile('(' + '|'.join(dic.keys()) + ')')
    sents, offset, words = [], 0, []
    for m in p.finditer(text):
        if offset < m.start():
            sents.append(text[offset:m.start()])
            words.append((m.group(), dic[m.group()]))
            offset = m.end()
Example #26
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 03:24

import hanlp

tokenizer = hanlp.load('CTB6_CONVSEG')
tagger = hanlp.load('CTB5_POS_RNN_FASTTEXT_ZH')
syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH')
semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH')

pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens') \
    .append(tagger, output_key='part_of_speech_tags') \
    .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', conll=False) \
    .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', conll=False)
print(pipeline)

text = '''HanLP是一系列模型与算法组成的自然语言处理工具包,目标是普及自然语言处理在生产环境中的应用。
HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。
内部算法经过工业界和学术界考验,配套书籍《自然语言处理入门》已经出版。
'''

doc = pipeline(text)
print(doc)
# By default the doc is json serializable, it holds true if your pipes output json serializable object too.
# print(json.dumps(doc, ensure_ascii=False, indent=2))

# You can save the config to disk for deploying or sharing.
pipeline.save('zh.json')
Example #27
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow
import hanlp
import use_case_diagram

tokenizer = hanlp.load('LARGE_ALBERT_BASE')
tagger = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL16_NEWS_BIAFFINE_ZH)
#如果客户之前用信用卡付款,而其信用卡账户退还交易被拒绝,则告知顾客并使用现金退款。
#摘要:顾客携带所购商品到达收银台,收银员使用pos系统记录每件商品。系统连续显示累计总额,并逐行显示细目。顾客输入支付信息,系统对支付信息进行验证和记录。系统更新库存信息。顾客从系统中得到购物小票,然后携带商品离开。如果客户之前用信用卡付款,而其信用卡账户退还交易被拒绝,则告知顾客并使用现金退款。如果在系统中未查找到该商品的标识码,则提示收银员并建议手工输入标识码。
#学生使用JXG系统查询新学期将开设的课程和授课教师的情况

#toker=tokenizer('收银员使用pos系统记录每件商品')
#系统管理员可以运用的功能,像修改密码,管理学生信息、成绩信息、课程信息、班级信息并且设置权限。


def POS_ana(text):
    toker = tokenizer(text)

    tag = tagger(toker)
    return tag


def DP_ana(text):
    toker = tokenizer(text)
    tag = tagger(toker)

    argu = []
    for i in range(len(toker)):
Example #28
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-08 04:43
# pip3 install tensorflow-serving-api-gpu
import grpc
import tensorflow as tf
from tensorflow_core.python.framework import tensor_util
from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc
import hanlp
from hanlp.common.component import KerasComponent

tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN)
transform = tagger.transform
del tagger

inputs = [['商品', '和', '服务'], ['我', '的', '希望', '是', '希望', '和平']]

samples = next(iter(transform.inputs_to_dataset(inputs)))[0]
print(samples)

channel = grpc.insecure_channel('{host}:{port}'.format(host='localhost',
                                                       port=8500))
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = 'ctb5_pos_rnn_20191229_015325'
request.model_spec.signature_name = 'serving_default'
request.inputs['embedding_input'].CopyFrom(
    tf.make_tensor_proto(samples, dtype=tf.float32))
result = stub.Predict(request, 10.0)  # 10 secs timeout
print(result)
prediction = tensor_util.MakeNdarray(result.outputs['dense'])
Example #29
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 22:26
import hanlp
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer

# Load an older single-task model to demonstrate a segmentation error (already fixed in the latest version):
tok: TransformerTaggingTokenizer = hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip')

tok.dict_force = tok.dict_combine = None
print(f'不挂词典:\n{tok("首相和川普通电话")}')

tok.dict_force = {'川普'}
print(f'强制模式:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}')  # Use with caution; see Chapter 2 of《自然语言处理入门》

tok.dict_force = {'川普通电话': ['川普', '通', '电话']}
print(f'强制校正:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}')

tok.dict_force = None
tok.dict_combine = {'美国总统'}
print(f'合并模式:\n{tok("首相和川普通电话,川普是美国总统。")}')

# Understanding this requires some algorithm background; beginners may refer to http://nlp.hankcs.com/book.php
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html
Example #30
 def setUp(self) -> None:
     super().setUp()
     self.mtl = hanlp.load(
         hanlp.pretrained.mtl.
         OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH,
         devices=-1)