def result_format():
    # With ShowTermNature disabled, terms print without part-of-speech tags
    HanLP.Config.ShowTermNature = False
    seg = HanLP.newSegment()
    print(seg.seg(sentences[0]))
    # Re-enable to get word/nature pairs back
    HanLP.Config.ShowTermNature = True
    seg = HanLP.newSegment()
    term_list = seg.seg(sentences[0])
    print(term_list)
    print([str(i.word) for i in term_list])
    print([str(i.nature) for i in term_list])
def hanlp_recognize(text):
    # segment = HanLP.newSegment().enableNameRecognize(True)
    # segment = HanLP.newSegment().enableTranslatedNameRecognize(True)
    # segment = HanLP.newSegment().enablePlaceRecognize(True)
    segment = HanLP.newSegment().enableOrganizationRecognize(True)
    term_list = segment.seg(text)
    print(term_list)
def test_custom_dict_forcing(self):
    segment = HanLP.newSegment('viterbi')
    CustomDictionary.insert('川普', 'nr 1')
    self.assertIn('四川/ns, 普通人/n, 与/cc, 川/b, 普通/a, 电话/n',
                  segment.seg('四川普通人与川普通电话').__str__())
    segment.enableCustomDictionaryForcing(True)
    self.assertIn('四川/ns, 普通人/n, 与/cc, 川普/nr, 通电话/vi',
                  segment.seg('四川普通人与川普通电话').__str__())
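For reference, the same forcing behavior as a minimal standalone sketch outside a unittest class; it assumes pyhanlp is installed and loads `CustomDictionary` via `JClass` as pyhanlp's demos do:

# Minimal sketch (assumptions: pyhanlp installed, CustomDictionary loaded via JClass)
from pyhanlp import HanLP, JClass

CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
CustomDictionary.insert('川普', 'nr 1')       # add the word with POS nr, frequency 1
segment = HanLP.newSegment('viterbi')
print(segment.seg('四川普通人与川普通电话'))   # statistics may still split 川普
segment.enableCustomDictionaryForcing(True)   # force custom-dictionary matches
print(segment.seg('四川普通人与川普通电话'))   # now 川普/nr is kept intact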
def raw_seg():
    """
    newSegment() supports several modes; viterbi is the default:

    Viterbi (viterbi): the best trade-off between speed and accuracy; also the
        shortest-path segmenter, solved with the Viterbi algorithm
    Double-array trie (dat): extremely fast dictionary segmentation, tens of
        millions of characters per second (POS tags may be unavailable,
        depending on your dictionary)
    Conditional random field (crf): high accuracy for segmentation, POS tagging
        and named-entity recognition; suited to demanding NLP tasks
    Perceptron (perceptron): segmentation, POS tagging and NER, with support
        for online learning
    N-shortest-path (nshort): slightly better NER, at the cost of speed
    """
    seg = HanLP.newSegment()
    for st in sentences:
        print(seg.seg(st))
    seg_crf = HanLP.newSegment("crf")
    for st in sentences:
        print(seg_crf.seg(st))
def number_recognition():
    # Demonstrate numeral and quantifier recognition
    sentences = [
        "十九元套餐包括什么",
        "九千九百九十九朵玫瑰",
        "壹佰块都不给我",
        "9012345678只蚂蚁",
        "牛奶三〇〇克*2",
        "ChinaJoy“扫黄”细则露胸超2厘米罚款",
    ]
    seg = HanLP.newSegment().enableNumberQuantifierRecognize(True)
    print("\n========== numeral/quantifier recognition: enabled ==========\n")
    for st in sentences:
        print(seg.seg(st))
    print("\n========== numeral/quantifier recognition: off by default ==========\n")
    print(HanLP.newSegment().seg(sentences[0]))
def cut_words_hanlp(rumor_text):
    # Load the stop-word list
    with open('search_stopwords.txt', encoding='UTF-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    # Strip surrounding whitespace
    rumor_text = rumor_text.strip()
    # Segment with HanLP place-name recognition enabled
    segment = HanLP.newSegment().enablePlaceRecognize(True)
    cut_rumors = segment.seg(rumor_text)
    out_list = []
    for item in cut_rumors:
        if item.word not in stopwords:
            item = str(item).split('/')
            if item[1] == 'ns':  # place name
                # skip whitespace tokens and 钟南山 (a person name mis-tagged as a place)
                if item[0] != '\t' and item[0] != '钟南山':
                    out_list.append(item[0])
    location_list = []
    for location in out_list:
        # Provinces
        if '省' in location:
            location_list.append(location.replace('省', ''))
        elif location in province_list:
            location_list.append(location)
        # Municipalities directly under the central government
        elif '市' in location:
            if '北京' in location:
                location_list.append('北京')
            elif '上海' in location:
                location_list.append('上海')
            elif '重庆' in location:
                location_list.append('重庆')
            elif '天津' in location:
                location_list.append('天津')
            else:
                location_list.append(location)
        # Other cities
        else:
            location_list.append(location + '市')
    return location_list
def newSegment(algorithm="viterbi"): """ * 创建一个分词器, * 这是一个工厂方法<br> * * @param algorithm 分词算法,传入算法的中英文名都可以,可选列表:<br> * <ul> * <li>维特比 (viterbi):效率和效果的最佳平衡</li> * <li>双数组trie树 (dat):极速词典分词,千万字符每秒</li> * <li>条件随机场 (crf):分词、词性标注与命名实体识别精度都较高,适合要求较高的NLP任务</li> * <li>感知机 (perceptron):分词、词性标注与命名实体识别,支持在线学习</li> * <li>N最短路 (nshort):命名实体识别稍微好一些,牺牲了速度</li> * <li>2阶隐马 (hmm2):训练速度较CRF快</li> * </ul> * @return 一个分词器 """ return HanLP.newSegment(algorithm)
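A short usage sketch of this factory (assuming the standard pyhanlp import); per the docstring, Chinese algorithm names work as well as the English ones:

from pyhanlp import HanLP

# Compare a few algorithms on the same sentence
for algo in ('viterbi', 'dat', 'perceptron'):
    seg = HanLP.newSegment(algo)
    print(algo, seg.seg('商品和服务'))

# The Chinese name is equally valid, e.g. HanLP.newSegment('感知机')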
def enable_seg():
    seg = HanLP.newSegment()
    # Chinese person-name recognition
    seg = seg.enableNameRecognize(True)
    # Transliterated (foreign) person-name recognition
    seg = seg.enableTranslatedNameRecognize(True)
    # Japanese person-name recognition
    seg = seg.enableJapaneseNameRecognize(True)
    # Place-name recognition
    seg = seg.enablePlaceRecognize(True)
    # Organization-name recognition
    seg = seg.enableOrganizationRecognize(True)
    for st in sentences:
        print(seg.seg(st))
def process(s):
    """
    Preprocess: segment the text, then remove stop words.
    :param s: str
    :return: space-joined word/POS pairs, e.g. '今天/t 天气/n 很/d 好/a'
    """
    s = s.replace('\t', '').replace('\n', '').replace("\u200b", "").strip()
    segment = HanLP.newSegment() \
        .enablePlaceRecognize(True) \
        .enableCustomDictionary(True) \
        .enableOrganizationRecognize(True) \
        .enableNameRecognize(True)
    hanlp_result = segment.seg(s)
    word_list = [i.word for i in hanlp_result]
    nature_list = [i.nature for i in hanlp_result]
    sss = [
        word_list[i] + "/" + str(nature_list[i])
        for i in range(len(word_list))
        if word_list[i] not in stop_words
    ]
    res = " ".join(sss)
    if not res.strip():
        res = "none/none"
    return res
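A hypothetical call illustrating the expected output shape; the `stop_words` set below is a stand-in for whatever list the surrounding module actually loads:

stop_words = {'的', '了'}  # stand-in; the real module loads its own stop-word list
print(process("今天的天气很好"))  # -> something like '今天/t 天气/n 很/d 好/a'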
#!/usr/bin/python3
# encoding: utf-8
# Author MrYx
# @Time: 2019/5/6 16:53

import json, re, xlrd, datetime, sys
import jieba.posseg as pseg
import io
import cpca
import pymysql
from pyhanlp import HanLP

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

NER = HanLP.newSegment().enableNameRecognize(True)


class EntityExtraction():
    def __init__(self):
        '''
        analys_pos maps each extraction task to the section(s) of the Excel
        file it extracts from
        '''
        self.input_file = 'OpenLaw判决书.xlsx'
        self.result_file = 'caseinfo.json'
        self.all_items = self.xlsx_to_dict(self.input_file)
        self.analys_pos = {
            'extract_ajxz': ['案由'],  # a task may extract from several sections, hence a list
            'extract_ajlx': ['当事人', '判决结果', '庭审过程'],
def main():
    if len(sys.argv) == 1:
        sys.argv.append('--help')

    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(HANLP_JAR_VERSION))
    arg_parser.add_argument('-v', '--version', required=False, action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task", help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment', help='word segmentation')
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag', dest='tag', action='store_true',
                            help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag', dest='tag', action='store_false',
                            help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument('-a', '--algorithm', type=str, default='viterbi',
                                help='algorithm of segmentation e.g. perceptron')
    parse_parser = task_parser.add_parser(name='parse', help='dependency parsing')
    parse_keyword = task_parser.add_parser(name='keyword', help='keyword extraction')
    parse_summary = task_parser.add_parser(name='summary', help='text summarization')
    server_parser = task_parser.add_parser(name='serve', help='start http server',
                                           description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    update_parser = task_parser.add_parser(name='update', help='update jar and data of HanLP')

    def add_args(p):
        p.add_argument("--config", default=PATH_CONFIG, help='path to hanlp.properties')
        # p.add_argument("--action", dest="action", default='predict',
        #                help='Which action (train, test, predict)?')

    add_args(segment_parser)
    add_args(parse_parser)
    add_args(parse_keyword)
    add_args(parse_summary)

    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else 'custom', HANLP_DATA_PATH))
        print('config : {}'.format(os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        eprint(msg)
        exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine').HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JException as e:
            if isinstance(e, java.lang.IllegalArgumentException):
                die('invalid algorithm {}'.format(args.algorithm))
            elif isinstance(e, java.lang.RuntimeException):
                die('failed to load required model')
            else:
                die('unknown exception {}'.format(repr(e)))

        is_lexical_analyzer = hasattr(segmenter, 'analyze')
        if not args.tag:
            if is_lexical_analyzer:
                segmenter.enablePartOfSpeechTagging(False)
                JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
            else:
                JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString() for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'keyword':
        for line in sys.stdin:
            line = line.strip()
            TextRankKeyword = JClass("com.hankcs.hanlp.summary.TextRankKeyword")
            keyword_list = HanLP.extractKeyword(line, 3)
            print(keyword_list)
            # print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'summary':
        for line in sys.stdin:
            line = line.strip()
            TextRankSentence = JClass("com.hankcs.hanlp.summary.TextRankSentence")
            sentence_list = HanLP.extractSummary(line, 3)
            print(sentence_list)
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('server.py does not support Python 2 yet; contributions to port it are welcome')
    elif args.task == 'update':
        if hanlp_installed_data_version() == '手动安装':  # manually installed
            die('Manually configured data cannot be auto-updated; to restore automatic '
                'installation, clear the HANLP-related environment variables')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()
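Typical invocations of this CLI read from stdin; a few illustrative examples, assuming this main() is wired to pyhanlp's `hanlp` console entry point:

# $ echo '商品和服务' | hanlp segment            # segmentation with POS tags
# $ echo '商品和服务' | hanlp segment --no-tag   # tokens only
# $ echo '万有引力是什么' | hanlp parse           # dependency parsing
# $ hanlp serve --port 8765                      # start the HTTP server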
# coding=utf-8
from pyhanlp import HanLP
import time

# HanLP.Config.ShowTermNature = False
text = "小区居民有的反对喂养流浪猫"
# CRFnewSegment = HanLP.newSegment("crf")
# term_list = CRFnewSegment.seg(text)
# print(term_list)

HanLP.Config.ShowTermNature = True
start_time = time.time()
CRFnewSegment = HanLP.newSegment("crf")
term_list = CRFnewSegment.seg(text)  # timed together with model loading
end_time = time.time()
print(term_list)
print('Segmentation + POS tagging took %f seconds' % (end_time - start_time))
# print([str(i.word) for i in term_list])
# print([str(i.nature) for i in term_list])

start_time = time.time()
seg_result = HanLP.segment("不要")
end_time = time.time()
print(' '.join('%s/%s' % (term.word, term.nature) for term in seg_result))
print('Segmentation + POS tagging took %f seconds' % (end_time - start_time))

# Dependency parsing
start_time = time.time()
sentence = HanLP.parseDependency('万有引力是什么')
end_time = time.time()
print(sentence)
print('Dependency parsing took %f seconds' % (end_time - start_time))
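A sketch of walking the dependency result, following the pattern in pyhanlp's own demos: the parsed sentence iterates CoNLL-style word objects with LEMMA, DEPREL and HEAD attributes:

# Sketch: print each word, its dependency relation, and its head
for word in HanLP.parseDependency('万有引力是什么').iterator():
    print('%s --(%s)--> %s' % (word.LEMMA, word.DEPREL, word.HEAD.LEMMA))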
import os, sys

src_path = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src"))
sys.path.append(src_path)

# The original snippet omitted these imports; HanLP comes from pyhanlp, and
# load_phrases_dict is assumed to be pypinyin's phrase-dictionary loader
from pyhanlp import HanLP
from pypinyin import load_phrases_dict
from soundshapecode import ssc
from soundshapecode.variant_kmp import VatiantKMP

SIMILARITY_THRESHOLD = 0.8
SSC_ENCODE_WAY = 'SOUND'  # 'ALL', 'SOUND', 'SHAPE'

# Register the correct pinyin for the place name 沌口
load_phrases_dict({'沌口': [['zhuàn'], ['kǒu']]})

if __name__ == "__main__":
    analyzer = HanLP.newSegment('perceptron')
    chi_word1 = '沌口'
    chi_word2 = '我住在钻口'
    ssc.getHanziStrokesDict()
    ssc.getHanziStructureDict()
    chi_word1_ssc = ssc.getSSC_sentence(chi_word1, SSC_ENCODE_WAY, analyzer)
    print(chi_word1_ssc)
    chi_word2_ssc = ssc.getSSC_sentence(chi_word2, SSC_ENCODE_WAY, analyzer)
    print(chi_word2_ssc)
    # Find variant words with the KMP string-matching algorithm,
    # which is more efficient than brute-force matching
    kmp = VatiantKMP(SIMILARITY_THRESHOLD)
    kmp.indexKMP(chi_word2_ssc, chi_word1_ssc, SSC_ENCODE_WAY)  # main string S, pattern T
def init_hanlp():
    segment = HanLP.newSegment() \
        .enableNameRecognize(True) \
        .enableOrganizationRecognize(True) \
        .enablePlaceRecognize(True) \
        .enableCustomDictionaryForcing(True)
    return segment
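A minimal usage sketch of the returned segmenter (the sentence is illustrative, and pyhanlp's HanLP import is assumed to be in scope):

segment = init_hanlp()
for term in segment.seg('微软公司于1975年由比尔·盖茨和保罗·艾伦创立'):
    print(term.word, term.nature)  # entity tags such as nt (org), nr (person), ns (place)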