def __init__(self, course_list: CourseList):
    self.pipeline = hanlp.pipeline()
    self.course_list = course_list
    self.course_dict = {}
    self.result = None
    self.pipeline_init()
def tokenizer_pku():
    # Load the custom dictionary and index it in a trie
    with open('dict.json', 'r', encoding='utf-8') as f:
        user_dict = json.load(f)
    trie = Trie()
    trie.update(user_dict)
    print(type(trie))

    text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
    print(split_sents(text, trie))

    tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    tokenizer = hanlp.pipeline() \
        .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
        .append(tokenizer, input_key='parts', output_key='tokens') \
        .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
    print(tokenizer(text))
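The snippet above reads its custom words from dict.json, which json.load must yield as a mapping that Trie.update accepts. A minimal sketch of producing such a file; the words and tag values below are placeholders:

# Sketch of a dict.json the snippet above can consume: keys are the custom
# words, values are arbitrary payloads (placeholder tags here).
import json

custom_words = {"自定义": "custom", "词典": "dict", "聪明人": "n"}
with open('dict.json', 'w', encoding='utf-8') as f:
    json.dump(custom_words, f, ensure_ascii=False, indent=2)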
def __init__(self, hanlp_tokenizer, hanlp_tagger, user_dict_path,
             stop_words_path, consider_tags_path, ignore_tag='-'):
    self.hanlp_tokenizer = hanlp_tokenizer
    self.tagger = hanlp_tagger
    self.ignore_tag = ignore_tag
    self.stop_words = self.load_stop_words(stop_words_path)
    self.considered_tags = self.load_consider_tags(consider_tags_path)
    self.user_dict = self.load_user_dict(user_dict_path)
    self.trie = Trie()
    self.trie.update(self.user_dict)
    self.tokenizer = hanlp.pipeline() \
        .append(self.split_sentences, output_key=('parts', 'offsets', 'words')) \
        .append(self.hanlp_tokenizer, input_key='parts', output_key='tokens') \
        .append(self.merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
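The constructor calls three loader helpers that the snippet doesn't show. Minimal sketches under assumed file formats (plain text with one entry per line for stopwords and tags, JSON word-to-tag mapping for the user dict); in the real class these would be methods:

import json

# Assumed format: one stopword per line.
def load_stop_words(path):
    with open(path, encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())

# Assumed format: one PoS tag per line.
def load_consider_tags(path):
    with open(path, encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())

# Assumed format: a JSON object mapping words to values, as Trie.update expects.
def load_user_dict(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)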
import json

import hanlp
from hanlp.utils.rules import split_sentence


def cws_data():
    tokenizer = hanlp.load("PKU_NAME_MERGED_SIX_MONTHS_CONVSEG")
    pipeline = hanlp.pipeline() \
        .append(split_sentence, output_key="sentences") \
        .append(tokenizer, output_key="tokens")
    book_results = {}
    for author in book_list:
        author_dir = data_dir / author
        for book in book_list[author]:
            book_res_file = author_dir / "{}.json".format(book)
            # Skip books that have already been segmented
            if book_res_file.exists():
                continue
            print("Processing: {} of {}".format(book, author))
            book_file = author_dir / "{}.clean.txt".format(book)
            book_text = book_file.read_text(encoding="utf-8")
            book_res = pipeline(book_text)
            book_results[book] = book_res
            with book_res_file.open(mode="w") as f:
                json.dump(book_res, f)
            print("Processing finished: {} of {}".format(book, author))
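cws_data() relies on a data_dir and book_list defined elsewhere. A sketch of that context, with a hypothetical directory layout (one sub-directory per author containing <book>.clean.txt files) and placeholder authors and titles:

from pathlib import Path

# Hypothetical setup assumed by cws_data(); names below are placeholders.
data_dir = Path('data/books')
book_list = {
    '鲁迅': ['呐喊', '彷徨'],
    '老舍': ['骆驼祥子'],
}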
import json

import hanlp
from flask import Flask, request

app = Flask(__name__)

# Load the tokenization model
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
# Part-of-speech tagging
tagger = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
# Dependency parsing
syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
# Semantic dependency parsing
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL16_NEWS_BIAFFINE_ZH)

# Pipeline
pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens') \
    .append(tagger, output_key='part_of_speech_tags') \
    .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'),
            output_key='syntactic_dependencies') \
    .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'),
            output_key='semantic_dependencies')


@app.route("/nlp", methods=["POST"])
def check():
    # Default response payload
    return_dict = {'code': 200, 'message': 'success', 'result': False}
    # The request body arrives as bytes; decode it as JSON
    get_data = request.get_data()
    get_data = json.loads(get_data)
    content = get_data.get('content')
    res = pipeline(content)
    # Serialize the analysis into the response
    return_dict['result'] = str(res)
    return json.dumps(return_dict, ensure_ascii=False)
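A minimal client for the /nlp endpoint above; the host and port are assumptions (Flask's defaults), and requests is just one convenient HTTP library:

import json
import requests

# POST a sentence to the endpoint; 127.0.0.1:5000 assumes Flask defaults.
payload = {'content': '2021年HanLPv2.1带来最先进的多语种NLP技术。'}
resp = requests.post('http://127.0.0.1:5000/nlp',
                     data=json.dumps(payload, ensure_ascii=False).encode('utf-8'))
print(resp.json())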
import hanlp
from hanlp_common.document import Document


def merge_pos_into_con(doc: Document):
    # Whether doc holds a single sample (flat) or a batch
    flat = isinstance(doc['pos'][0], str)
    if flat:
        doc = Document((k, [v]) for k, v in doc.items())
    for tree, tags in zip(doc['con'], doc['pos']):
        offset = 0
        # Replace each placeholder '_' pre-terminal with the predicted PoS tag
        for subtree in tree.subtrees(lambda t: t.height() == 2):
            tag = subtree.label()
            if tag == '_':
                subtree.set_label(tags[offset])
            offset += 1
    if flat:
        doc = doc.squeeze()
    return doc


pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_ELECTRA_SMALL)
nlp = hanlp.pipeline() \
    .append(pos, input_key='tok', output_key='pos') \
    .append(con, input_key='tok', output_key='con') \
    .append(merge_pos_into_con, input_key='*')  # '*' feeds the whole Document in
print(f'The pipeline looks like this: {nlp}')
doc = nlp(tok=[
    "2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"
])
print(doc)
doc.pretty_print()

# If you need to parse raw text, simply add a tokenizer into this pipeline.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
nlp.insert(0, tok, output_key='tok')
print(f'The pipeline looks like this: {nlp}')
doc = nlp('2021年HanLPv2.1带来最先进的多语种NLP技术。')
print(doc)
doc.pretty_print()
def setup_initialise(sender, **kwargs):
    # Initialise storage session connections
    try:
        _session_pool = {
            "redis": RedisBase().Session,
            "mysql": MysqlBase().Session,
            "elastic": ElasticBase().Session
        }
        setattr(sender, "_session_pool", _session_pool)
        log.info("successfully loaded backend session pool on deadpool app at on_configure.connect")
    except Exception as e:
        log.error(e)

    # Initialise the HanLP processing pipeline
    try:
        tokenizer = hanlp.load('CTB6_CONVSEG')
        tagger = hanlp.load('CTB5_POS_RNN_FASTTEXT_ZH')
        syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH')
        semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH')
        _hanlp_pipeline = hanlp.pipeline() \
            .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
            .append(tokenizer, output_key='tokens') \
            .append(tagger, output_key='part_of_speech_tags') \
            .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'),
                    output_key='syntactic_dependencies', conll=False) \
            .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'),
                    output_key='semantic_dependencies', conll=False)
        setattr(sender, "_hanlp_pipeline", _hanlp_pipeline)
        log.info("successfully loaded hanlp process pipeline on deadpool app at on_configure.connect")
    except Exception as e:
        log.error(e)

    # Initialise the HanLP NER recognizer model
    try:
        # Load the model
        _hanlp_ner_recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
        setattr(sender, "_hanlp_ner_recognizer", _hanlp_ner_recognizer)
        log.info("successfully loaded hanlp NER recognizer model on deadpool app at on_configure.connect")
    except Exception as e:
        log.error(e)

    # Initialise the pkuseg token segmentation toolkit
    try:
        # Load the model
        _pkuseg_toolkit = pkuseg.pkuseg(
            user_dict=os.path.join(cur_dir, "data", "custom", 'pkuseg_user_dict.txt'),
            postag=True)
        setattr(sender, "_pkuseg_toolkit", _pkuseg_toolkit)
        log.info("successfully loaded PKUseg token segmentation toolkit on deadpool app at on_configure.connect")
    except Exception as e:
        log.error(e)

    # Initialise stopwords for segmentation usage
    try:
        # cn_stopwords.txt     common Chinese stopwords
        # hit_stopwords.txt    Harbin Institute of Technology stopwords
        # scu_stopwords.txt    Sichuan University Machine Intelligence Lab stopwords
        # baidu_stopwords.txt  Baidu stopwords
        stopwords = []
        data_sets = glob.glob(os.path.join(cur_dir, 'data', 'stopwords', '*.txt'))
        for item in data_sets:
            with open(item, encoding="utf-8") as f:
                for line in f:
                    stopwords.append(line.strip())
        _stopwords = list(set(stopwords))
        setattr(sender, "_stopwords", _stopwords)
        log.info("successfully loaded stopwords for segmentation usage on deadpool app at on_configure.connect")
    except Exception as e:
        log.error(e)
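setup_initialise reads like a Celery on_configure.connect receiver, with sender being the app instance. Under that assumption, a worker task could reach the attached objects as sketched below; the task name and wiring are hypothetical:

from celery import current_app

# Hypothetical task showing how the attributes attached to the app in
# setup_initialise could be reached from a worker (assumes a Celery app).
@current_app.task(bind=True)
def analyse_text(self, text):
    pipeline = getattr(self.app, "_hanlp_pipeline")
    return pipeline(text)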
def split_sents(text: str, trie: Trie):
    # Cut the text around the longest dictionary hits
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for start, end, value in words:
        if pre_start != start:
            sents.append(text[pre_start:start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words


print(split_sents(text, trie))


def merge_parts(parts, offsets, words):
    # Interleave tokenized parts and dictionary hits by their offsets
    items = [(i, p) for (i, p) in zip(offsets, parts)]
    items += [(start, [value]) for (start, end, value) in words]
    return [each for x in sorted(items) for each in x[1]]


tokenizer = hanlp.pipeline() \
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
    .append(tokenizer, input_key='parts', output_key='tokens') \
    .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
print(tokenizer(text))
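The snippet above assumes trie, text, and a loaded tokenizer from the surrounding script. A sketch of that context; the dictionary entries and sample text are placeholders, and the Trie import path differs across HanLP versions:

import hanlp
from hanlp.common.trie import Trie  # newer releases: from hanlp_trie import Trie

trie = Trie()
trie.update({'自定义': 'custom', '词典': 'dict'})
text = '自定义词典可以干预分词结果。'
tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')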
def __init__(self):
    self.tokenizer = hanlp.load('LARGE_ALBERT_BASE')
    self.pipeline = hanlp.pipeline() \
        .append(hanlp.utils.rules.split_sentence, output_key="sentences") \
        .append(self.tokenizer, output_key="tokens")
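For a quick smoke test, the same two-stage pipeline can be built without the class wrapper; the sample sentence is arbitrary:

import hanlp
from hanlp.utils.rules import split_sentence

# Standalone equivalent of the wrapper above.
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
pipeline = hanlp.pipeline() \
    .append(split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens')

doc = pipeline('商品和服务。晓美焰来到北京立方庭参观自然语义科技公司。')
print(doc['sentences'])  # list of sentences
print(doc['tokens'])     # one token list per sentence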
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 20:47
import hanlp

# Pipeline allows blending multiple callable functions, whether each is a rule, a TensorFlow
# component or a PyTorch one. However, it's slower than the MTL framework.
# pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
# In case both tf and torch are used, load tf first.
HanLP = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(hanlp.load('CTB9_TOK_ELECTRA_SMALL'), output_key='tok') \
    .append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \
    .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \
    .append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=False), output_key='dep', input_key='tok') \
    .append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok')

doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
print(doc)
doc.pretty_print()
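Each output_key above becomes a field of the returned Document, so individual annotation layers can be pulled out directly. A small sketch; the NER tuple layout follows the HanLP 2.1 convention:

# Access annotation layers of `doc` by their output_key.
print(doc['sentences'])  # sentences produced by the rule-based splitter
print(doc['tok'])        # one token list per sentence
print(doc['ner'])        # per sentence: (entity, type, start, end) tuples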
print('splitted with trie:')
print(split_sents(text, trie))
print()


def merge_parts(parts, offsets, words):
    items = [(i, p) for (i, p) in zip(offsets, parts)]
    items += [(start, [word]) for (word, value, start, end) in words]
    # In case you need the tag, use the following line instead:
    # items += [(start, [(word, value)]) for (word, value, start, end) in words]
    return [each for x in sorted(items) for each in x[1]]


# pipeline_tokenizer = hanlp.pipeline() \
#     .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
#     .append(tokenizer, output_key='tokens')
pipeline_splitter = hanlp.pipeline() \
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie)
splitted = pipeline_splitter(text)
print('splitted:')
pprint.pprint(splitted)
print()

pipeline_tokenizer = pipeline_splitter.append(tokenizer, input_key='parts', output_key='tokens')
tokenized = pipeline_tokenizer(text)
print('tokenized:')
pprint.pprint(tokenized)
print()
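The snippet stops after tokenization. Mirroring the earlier trie examples on this page, the dictionary hits can be spliced back in with one more stage; a sketch reusing the merge_parts defined above:

# Final stage: merge the dictionary hits back into the token stream.
pipeline_merger = pipeline_tokenizer.append(
    merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
merged = pipeline_merger(text)
print('merged:')
pprint.pprint(merged)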
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 20:47
import hanlp

# Pipeline allows blending multiple callable functions, whether each is a rule, a TensorFlow
# component or a PyTorch one. However, it's slower than the MTL framework.
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)  # In case both tf and torch are used, load tf first
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tok, output_key='tok') \
    .append(pos, output_key='pos')

doc = pipeline('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
print(doc)
doc.pretty_print()