Example #1
    def __init__(self, course_list: CourseList):
        self.pipeline = hanlp.pipeline()
        self.course_list = course_list
        self.course_dict = {}
        self.result = None

        self.pipeline_init()
Example #2
def tokenizer_pku():
    # Load the user dictionary and index it with a trie for longest-match lookup.
    with open('dict.json', 'r', encoding='utf-8') as f:
        user_dict = json.load(f)
    trie = Trie()
    trie.update(user_dict)
    print(type(trie))
    text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
    print(split_sents(text, trie))
    tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    tokenizer = hanlp.pipeline() \
        .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
        .append(tokenizer, input_key='parts', output_key='tokens') \
        .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
    print(tokenizer(text))
Example #3
    def __init__(self,
                 hanlp_tokenizer,
                 hanlp_tagger,
                 user_dict_path,
                 stop_words_path,
                 consider_tags_path,
                 ignore_tag='-'):
        self.hanlp_tokenizer = hanlp_tokenizer
        self.tagger = hanlp_tagger
        self.ignore_tag = ignore_tag
        self.stop_words = self.load_stop_words(stop_words_path)
        self.considered_tags = self.load_consider_tags(consider_tags_path)
        self.user_dict = self.load_user_dict(user_dict_path)
        self.trie = Trie()
        self.trie.update(self.user_dict)
        self.tokenizer = hanlp.pipeline() \
            .append(self.split_sentences, output_key=('parts', 'offsets', 'words')) \
            .append(self.hanlp_tokenizer, input_key='parts', output_key='tokens') \
            .append(self.merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
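The split_sentences and merge_parts methods referenced above are not shown in this example. A hedged sketch of what they might look like, modeled on the module-level split_sents/merge_parts in Example #8 (the method bodies and the (start, end, value) layout returned by trie.parse_longest are assumptions, not code from this project):

    def split_sentences(self, text):
        # Assumed to mirror split_sents in Example #8: cut out the spans matched
        # by the user-dictionary trie, keeping offsets for later re-merging.
        words = self.trie.parse_longest(text)
        parts, offsets, pre_start = [], [], 0
        for start, end, value in words:
            if pre_start != start:
                parts.append(text[pre_start:start])
                offsets.append(pre_start)
            pre_start = end
        if pre_start != len(text):
            parts.append(text[pre_start:])
            offsets.append(pre_start)
        return parts, offsets, words

    def merge_parts(self, tokens, offsets, words):
        # Assumed to mirror merge_parts in Example #8: interleave tokenizer
        # output with dictionary matches in original text order.
        items = [(i, p) for (i, p) in zip(offsets, tokens)]
        items += [(start, [value]) for (start, end, value) in words]
        return [each for x in sorted(items) for each in x[1]]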
Example #4
def cws_data():
    tokenizer = hanlp.load("PKU_NAME_MERGED_SIX_MONTHS_CONVSEG")
    pipeline = hanlp.pipeline() \
        .append(split_sentence, output_key="sentences") \
        .append(tokenizer, output_key="tokens")
    book_results = {}

    for author in book_list:
        author_dir = data_dir / author
        for book in book_list[author]:
            book_res_file = author_dir / "{}.json".format(book)
            if book_res_file.exists():
                continue
            print("Processing: {} of {}".format(book, author))
            book_file = author_dir / "{}.clean.txt".format(book)
            book_text = book_file.read_text(encoding="utf-8")
            book_res = pipeline(book_text)
            book_results[book] = book_res
            with book_res_file.open(mode="w") as f:
                json.dump(book_res, f)
            print("Processing finished: {} of {}".format(book, author))
Example #5
import json

import hanlp
from flask import Flask, request

app = Flask(__name__)

# Load the tokenization model
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
# Part-of-speech tagging
tagger = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
# Dependency (syntactic) parsing
syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
# Semantic dependency parsing
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL16_NEWS_BIAFFINE_ZH)
# Assemble the pipeline
pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens') \
    .append(tagger, output_key='part_of_speech_tags') \
    .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies') \
    .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies')

@app.route("/nlp", methods=["POST"])
def check():
    # Default response payload
    return_dict = {'code': 200,
                   'message': 'success', 'result': False}
    # Read the request body; it arrives as bytes and must be decoded as JSON
    data = json.loads(request.get_data())
    content = data.get('content')
    res = pipeline(content)
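The handler above is cut off before it builds a response. A minimal, hypothetical completion under the snippet's own conventions (the response schema beyond return_dict is an assumption, not part of the original code):

    # Hypothetical continuation, not in the original snippet: attach the
    # analysis to the prepared payload and return it as JSON. str() is used so
    # the sketch does not depend on how serializable the Document values are.
    return_dict['result'] = str(res)
    return json.dumps(return_dict, ensure_ascii=False)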
Example #6
import hanlp
from hanlp_common.document import Document


def merge_pos_into_con(doc):
    # If the pipeline ran on a single sentence the values are flat lists; wrap
    # them so the loop below handles single- and multi-sentence input alike.
    flat = isinstance(doc['pos'][0], str)
    if flat:
        doc = Document((k, [v]) for k, v in doc.items())
    for tree, tags in zip(doc['con'], doc['pos']):
        offset = 0
        for subtree in tree.subtrees(lambda t: t.height() == 2):
            tag = subtree.label()
            if tag == '_':
                subtree.set_label(tags[offset])
            offset += 1
    if flat:
        doc = doc.squeeze()
    return doc


pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_ELECTRA_SMALL)  # constituency parser referenced as 'con' below
nlp = hanlp.pipeline() \
    .append(pos, input_key='tok', output_key='pos') \
    .append(con, input_key='tok', output_key='con') \
    .append(merge_pos_into_con, input_key='*')
print(f'The pipeline looks like this: {nlp}')
doc = nlp(tok=[
    "2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"
])
print(doc)
doc.pretty_print()

# If you need to parse raw text, simply add a tokenizer into this pipeline.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
nlp.insert(0, tok, output_key='tok')
print(f'The pipeline looks like this: {nlp}')
doc = nlp('2021年HanLPv2.1带来最先进的多语种NLP技术。')
print(doc)
doc.pretty_print()
Example #7
def setup_initialise(sender, **kwargs):
    # Initialise storage session connection
    try:
        _session_pool = {
            "redis": RedisBase().Session,
            "mysql": MysqlBase().Session,
            "elastic": ElasticBase().Session
        }
        setattr(sender, "_session_pool", _session_pool)
        log.info(
            "successes load backend session pool on deadpool app at on_configure.connect"
        )
    except Exception as e:
        log.error(e)

    # Initialise HanLP process pipelines
    try:
        tokenizer = hanlp.load('CTB6_CONVSEG')
        tagger = hanlp.load('CTB5_POS_RNN_FASTTEXT_ZH')
        syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH')
        semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH')
        _hanlp_pipeline = hanlp.pipeline() \
            .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
            .append(tokenizer, output_key='tokens') \
            .append(tagger, output_key='part_of_speech_tags') \
            .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', conll=False) \
            .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', conll=False)
        setattr(sender, "_hanlp_pipeline", _hanlp_pipeline)
        log.info(
            "successes load hanlp process pipeline on deadpool app at on_configure.connect"
        )
    except Exception as e:
        log.error(e)

    # Initialise HanLP NER recognizer model
    try:
        # Load the pretrained NER model
        _hanlp_ner_recognizer = hanlp.load(
            hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
        setattr(sender, "_hanlp_ner_recognizer", _hanlp_ner_recognizer)
        log.info(
            "successes load hanlp NER recognizer model on deadpool app at on_configure.connect"
        )
    except Exception as e:
        log.error(e)

    # Initialise pkuseg token segmentation toolkit
    try:
        # Load the pkuseg model together with a user dictionary
        _pkuseg_toolkit = pkuseg.pkuseg(user_dict=os.path.join(
            cur_dir, "data", "custom", 'pkuseg_user_dict.txt'),
                                        postag=True)
        setattr(sender, "_pkuseg_toolkit", _pkuseg_toolkit)
        log.info(
            "successes load PKUseg token segmentation toolkit on deadpool app at on_configure.connect"
        )
    except Exception as e:
        log.error(e)

    # Initialise stopwords for segmentation usage
    try:
        # cn_stopwords.txt    common Chinese stopwords
        # hit_stopwords.txt   Harbin Institute of Technology stopwords
        # scu_stopwords.txt   Sichuan University Machine Intelligence Lab stopwords
        # baidu_stopwords.txt Baidu stopwords
        stopwords = []
        data_sets = glob.glob(
            os.path.join(cur_dir, 'data', 'stopwords', '*.txt'))
        for item in data_sets:
            with open(item, encoding="utf-8") as f:
                for line in f:
                    stopwords.append(line.strip())
        _stopwords = list(set(stopwords))
        setattr(sender, "_stopwords", _stopwords)
        log.info(
            "successes load stopwords for segmentation usage on deadpool app at on_configure.connect"
        )
    except Exception as e:
        log.error(e)
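The sender argument and the "on_configure.connect" wording in the log messages suggest this function is attached to a Celery application's on_configure signal. A hedged sketch of how it might be wired up (the app name deadpool_app is hypothetical, not taken from the snippet):

from celery import Celery

# Hypothetical registration: run setup_initialise when the app is configured.
deadpool_app = Celery("deadpool")
deadpool_app.on_configure.connect(setup_initialise)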
Example #8
def split_sents(text, trie):
    # Cut the text around user-dictionary matches: uncovered spans go to the
    # statistical tokenizer, and their offsets are kept for re-merging later.
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for start, end, value in words:
        if pre_start != start:
            sents.append(text[pre_start:start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words


print(split_sents(text, trie))


def merge_parts(parts, offsets, words):
    # Interleave the tokenizer output with the dictionary matches, ordered by
    # their offsets in the original text.
    items = [(i, p) for (i, p) in zip(offsets, parts)]
    items += [(start, [value]) for (start, end, value) in words]
    return [each for x in sorted(items) for each in x[1]]


tokenizer = hanlp.pipeline() \
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
    .append(tokenizer, input_key='parts', output_key='tokens') \
    .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')

print(tokenizer(text))
Example #9
    def __init__(self):
        self.tokenizer = hanlp.load('LARGE_ALBERT_BASE')
        self.pipeline = hanlp.pipeline() \
            .append(hanlp.utils.rules.split_sentence, output_key="sentences") \
            .append(self.tokenizer, output_key="tokens")
Example #10
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 20:47
import hanlp

# A pipeline can blend multiple callables, whether they are rules, TensorFlow components or PyTorch
# components. However, it's slower than the MTL framework (see the sketch after this example).
# pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)  # In case both tf and torch are used, load tf first.

HanLP = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(hanlp.load('CTB9_TOK_ELECTRA_SMALL'), output_key='tok') \
    .append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \
    .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \
    .append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=False), output_key='dep', input_key='tok') \
    .append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok')

doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
print(doc)
doc.pretty_print()
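For comparison with the MTL framework mentioned in the comment above, here is a minimal sketch using a single multi-task model instead of a pipeline (assuming the pretrained identifier CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH is available in your HanLP 2.1 installation):

import hanlp

# One multi-task model predicts tok/pos/ner/srl/dep/sdp/con jointly, which is
# typically faster than chaining single-task models in a pipeline.
mtl = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
doc = mtl('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')
doc.pretty_print()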
Example #11
    print('split with trie:')
    print(split_sents(text, trie))
    print()

    def merge_parts(parts, offsets, words):
        items = [(i, p) for (i, p) in zip(offsets, parts)]
        items += [(start, [word]) for (word, value, start, end) in words]
        # In case you need the tag, use the following line instead
        # items += [(start, [(word, value)]) for (word, value, start, end) in words]
        return [each for x in sorted(items) for each in x[1]]

    # pipeline_tokenizer = hanlp.pipeline() \
    #     .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    #     .append(tokenizer, output_key='tokens')

    pipeline_splitter = hanlp.pipeline() \
        .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie)
    split_result = pipeline_splitter(text)

    print('split:')
    pprint.pprint(split_result)
    print()

    pipeline_tokenizer = pipeline_splitter.append(tokenizer,
                                                  input_key='parts',
                                                  output_key='tokens')
    tokenized = pipeline_tokenizer(text)

    print('tokenized:')
    pprint.pprint(tokenized)
    print()
Example #12
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 20:47
import hanlp

# A pipeline can blend multiple callables, whether they are rules, TensorFlow components or PyTorch
# components. However, it's slower than the MTL framework.
# In case both tf and torch are used, load the tf model first.
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)

pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tok, output_key='tok') \
    .append(pos, output_key='pos')

doc = pipeline('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
print(doc)
doc.pretty_print()