Example No. 1
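Note: these snippets are excerpts, and most of them assume a module-level HanLP import plus a sentences list. A minimal setup sketch (the sample sentences are illustrative placeholders):

from pyhanlp import HanLP

sentences = ["商品和服务", "微软公司于1975年由比尔·盖茨和保罗·艾伦创立"]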
def result_format():
    # Hide POS tags (natures) in the printed result
    HanLP.Config.ShowTermNature = False
    seg = HanLP.newSegment()
    print(seg.seg(sentences[0]))
    # Show POS tags again
    HanLP.Config.ShowTermNature = True
    seg = HanLP.newSegment()
    term_list = seg.seg(sentences[0])
    print(term_list)
    # Each Term exposes the word and its nature (POS tag) separately
    print([str(i.word) for i in term_list])
    print([str(i.nature) for i in term_list])
Example No. 2
def hanlp_recognize(text):
    # Other NER options (enable one as needed):
    # segment = HanLP.newSegment().enableNameRecognize(True)
    # segment = HanLP.newSegment().enableTranslatedNameRecognize(True)
    # segment = HanLP.newSegment().enablePlaceRecognize(True)
    segment = HanLP.newSegment().enableOrganizationRecognize(True)
    term_list = segment.seg(text)
    print(term_list)
Example No. 3
def test_custom_dict_forcing(self):
    segment = HanLP.newSegment('viterbi')
    CustomDictionary.insert('川普', 'nr 1')
    self.assertIn('四川/ns, 普通人/n, 与/cc, 川/b, 普通/a, 电话/n',
                  segment.seg('四川普通人与川普通电话').__str__())
    segment.enableCustomDictionaryForcing(True)
    self.assertIn('四川/ns, 普通人/n, 与/cc, 川普/nr, 通电话/vi',
                  segment.seg('四川普通人与川普通电话').__str__())
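The test assumes CustomDictionary is already in scope; in pyhanlp it can be obtained through JClass. A minimal sketch:

from pyhanlp import JClass

CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
CustomDictionary.insert('川普', 'nr 1')  # word plus a "POS frequency" attribute string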
Example No. 4
def raw_seg():
    """
    newSegment() supports several modes; viterbi is the default:
    - viterbi: the best balance of speed and accuracy; also the shortest-path
      segmenter (HanLP solves the shortest path with the Viterbi algorithm)
    - dat (double-array trie): extremely fast dictionary segmentation, tens of
      millions of characters per second (POS tags may be unavailable,
      depending on your dictionary)
    - crf (conditional random field): high accuracy in segmentation, POS
      tagging and named-entity recognition; suited to demanding NLP tasks
    - perceptron: segmentation, POS tagging and named-entity recognition,
      with support for online learning
    - nshort (N-shortest paths): slightly better named-entity recognition,
      at the cost of speed
    """
    seg = HanLP.newSegment()
    for st in sentences:
        print(seg.seg(st))

    seg_crf = HanLP.newSegment("crf")
    for st in sentences:
        print(seg_crf.seg(st))
    """
Example No. 5
def number_recognition():
    # Demo: recognition of numerals and number-plus-quantifier compounds
    sentences = [
        "十九元套餐包括什么",
        "九千九百九十九朵玫瑰",
        "壹佰块都不给我",
        "9012345678只蚂蚁",
        "牛奶三〇〇克*2",
        "ChinaJoy“扫黄”细则露胸超2厘米罚款",
    ]

    seg = HanLP.newSegment().enableNumberQuantifierRecognize(True)

    print("\n========== numeral/quantifier recognition enabled ==========\n")
    for st in sentences:
        print(seg.seg(st))
    print("\n========== default (recognition disabled) ==========\n")
    print(HanLP.newSegment().seg(sentences[0]))
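With the option enabled, adjacent numerals and measure words such as 十九元 are expected to come out as single mq terms instead of separate tokens.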
Example No. 6
def cut_words_hanlp(rumor_text):
    # Load the stopword list
    with open('search_stopwords.txt', encoding='UTF-8') as f:
        stopwords = [line.strip() for line in f]
    # Trim surrounding whitespace
    rumor_text = rumor_text.strip()

    # Run HanLP segmentation with place-name recognition enabled
    segment = HanLP.newSegment().enablePlaceRecognize(True)
    cut_rumors = segment.seg(rumor_text)

    out_list = []
    for item in cut_rumors:
        if item.word not in stopwords:
            item = str(item).split('/')
            if item[1] == 'ns':  # ns = place name
                if item[0] != '\t' and item[0] != '钟南山':
                    out_list.append(item[0])

    location_list = []
    for location in out_list:
        # Provinces
        if '省' in location:
            location_list.append(location.replace('省', ''))
        elif location in province_list:
            location_list.append(location)
        # Municipalities directly under the central government
        elif '市' in location:
            if '北京' in location:
                location_list.append('北京')
            elif '上海' in location:
                location_list.append('上海')
            elif '重庆' in location:
                location_list.append('重庆')
            elif '天津' in location:
                location_list.append('天津')
            else:
                location_list.append(location)
        # Other cities: append the 市 suffix
        else:
            location_list.append(location + '市')

    return location_list
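A hypothetical usage sketch; the function assumes a module-level province_list of province names and a search_stopwords.txt file next to the script:

province_list = ['黑龙江', '湖北']  # illustrative subset
print(cut_words_hanlp('黑龙江省与武汉市均出现病例'))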
Example No. 7
def newSegment(algorithm="viterbi"):
    """
    Create a segmenter (a factory method).

    :param algorithm: segmentation algorithm; either its Chinese or English
                      name may be passed. Options:
                      - viterbi: the best balance of speed and accuracy
                      - dat (double-array trie): extremely fast dictionary
                        segmentation, tens of millions of characters per second
                      - crf (conditional random field): high accuracy in
                        segmentation, POS tagging and named-entity recognition;
                        suited to demanding NLP tasks
                      - perceptron: segmentation, POS tagging and named-entity
                        recognition, with support for online learning
                      - nshort (N-shortest paths): slightly better named-entity
                        recognition, at the cost of speed
                      - hmm2 (second-order HMM): trains faster than CRF
    :return: a segmenter
    """
    return HanLP.newSegment(algorithm)
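A short usage sketch for the factory wrapper above:

seg = newSegment('perceptron')
print(seg.seg('商品和服务'))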
Example No. 8
def enable_seg():
    seg = HanLP.newSegment()

    # Chinese person-name recognition
    seg = seg.enableNameRecognize(True)

    # Transliterated (foreign) person-name recognition
    seg = seg.enableTranslatedNameRecognize(True)

    # Japanese person-name recognition
    seg = seg.enableJapaneseNameRecognize(True)

    # Place-name recognition
    seg = seg.enablePlaceRecognize(True)

    # Organization-name recognition
    seg = seg.enableOrganizationRecognize(True)

    for st in sentences:
        print(seg.seg(st))
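Each enable* call returns the segmenter itself, so the reassignments above are optional; the calls can equally be chained, as Example No. 14 below does.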
Example No. 9
def process(s):
    """
    Preprocess: segment, then drop stopwords.
    :param s: str
    :return: space-joined "word/POS" pairs, e.g. '今天/t 天气/n'
    """
    s = s.replace('\t', '').replace('\n', '').replace("\u200b", "").strip()
    segment = HanLP.newSegment() \
        .enablePlaceRecognize(True) \
        .enableCustomDictionary(True) \
        .enableOrganizationRecognize(True) \
        .enableNameRecognize(True)
    hanlp_result = segment.seg(s)
    word_list = [i.word for i in hanlp_result]
    nature_list = [i.nature for i in hanlp_result]

    tagged = [
        word_list[i] + "/" + str(nature_list[i]) for i in range(len(word_list))
        if word_list[i] not in stop_words
    ]
    res = " ".join(tagged)
    if not res.strip():
        res = "none/none"
    return res
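A hypothetical usage sketch; the function assumes a module-level stop_words collection:

stop_words = {'很'}  # illustrative stopword set
print(process('今天天气很好'))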
Example No. 10
#!/usr/bin/python3
# encoding: utf-8
# Author MrYx
# @Time: 2019/5/6 16:53

import datetime
import io
import json
import re
import sys

import cpca
import jieba.posseg as pseg
import pymysql
import xlrd
from pyhanlp import HanLP

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

# Segmenter with Chinese person-name recognition enabled
NER = HanLP.newSegment().enableNameRecognize(True)


class EntityExtraction():
    def __init__(self):
        '''
        analys_pos maps each extraction task to the excel sections it
        should read from.
        '''
        self.input_file = 'OpenLaw判决书.xlsx'
        self.result_file = 'caseinfo.json'
        self.all_items = self.xlsx_to_dict(self.input_file)
        self.analys_pos = {
            'extract_ajxz': ['案由'],  # a task may read several sections, hence a list
            'extract_ajlx': ['当事人', '判决结果', '庭审过程'],
Example No. 11
def main():
    if len(sys.argv) == 1:
        sys.argv.append('--help')

    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(
            HANLP_JAR_VERSION))
    arg_parser.add_argument('-v',
                            '--version',
                            required=False,
                            action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task",
                                            help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment',
                                            help='word segmentation')
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag',
                            dest='tag',
                            action='store_true',
                            help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag',
                            dest='tag',
                            action='store_false',
                            help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument(
        '-a',
        '--algorithm',
        type=str,
        default='viterbi',
        help='algorithm of segmentation e.g. perceptron')
    parse_parser = task_parser.add_parser(name='parse',
                                          help='dependency parsing')
    parse_keyword = task_parser.add_parser(name='keyword',
                                           help='keyword extraction')
    parse_summary = task_parser.add_parser(name='summary',
                                           help='text summarization')
    server_parser = task_parser.add_parser(
        name='serve',
        help='start http server',
        description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    update_parser = task_parser.add_parser(name='update',
                                           help='update jar and data of HanLP')

    def add_args(p):
        p.add_argument("--config",
                       default=PATH_CONFIG,
                       help='path to hanlp.properties')
        # p.add_argument("--action", dest="action", default='predict',
        #                help='Which action (train, test, predict)?')

    add_args(segment_parser)
    add_args(parse_parser)
    add_args(parse_keyword)
    add_args(parse_summary)

    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar  {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else 'custom',
                                   HANLP_DATA_PATH))
        print('config    : {}'.format(
            os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        eprint(msg)
        exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine'
                   ).HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JException as e:
            if isinstance(e, java.lang.IllegalArgumentException):
                die('invalid algorithm {}'.format(args.algorithm))
            elif isinstance(e, java.lang.RuntimeException):
                die('failed to load required model')
            else:
                die('unknown exception {}'.format(repr(e)))

        is_lexical_analyzer = hasattr(segmenter, 'analyze')
        if not args.tag:
            if is_lexical_analyzer:
                segmenter.enablePartOfSpeechTagging(False)
            # hide POS tags in the printed terms either way
            JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString()
                           for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'keyword':
        for line in sys.stdin:
            line = line.strip()
            keyword_list = HanLP.extractKeyword(line, 3)
            print(keyword_list)
    elif args.task == 'summary':
        for line in sys.stdin:
            line = line.strip()
            sentence_list = HanLP.extractSummary(line, 3)
            print(sentence_list)
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('server.py does not support Python 2 yet; contributions to port it are welcome')
    elif args.task == 'update':
        if hanlp_installed_data_version() == '手动安装':  # i.e. data was installed manually
            die('Manually installed data does not support auto-upgrade; to restore '
                'automatic installation, clear the HanLP-related environment variables')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()
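When pyhanlp is installed via pip, this main() is registered as the hanlp console command, so the tasks above can be run as, for example, hanlp segment --no-tag with input piped on stdin.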
Example No. 12
# coding=utf-8
from pyhanlp import HanLP
import time

# HanLP.Config.ShowTermNature = False  # set to False to hide POS tags
text = "小区居民有的反对喂养流浪猫"

HanLP.Config.ShowTermNature = True
CRFnewSegment = HanLP.newSegment("crf")  # building the CRF segmenter loads the model
start_time = time.time()
term_list = CRFnewSegment.seg(text)
end_time = time.time()
print(term_list)
print('Segmentation + POS tagging took %f seconds' % (end_time - start_time))
# print([str(i.word) for i in term_list])
# print([str(i.nature) for i in term_list])

start_time = time.time()
seg_result = HanLP.segment("不要")
end_time = time.time()
print(' '.join('%s/%s' % (term.word, term.nature) for term in seg_result))
print('Segmentation + POS tagging took %f seconds' % (end_time - start_time))

# Dependency parsing
start_time = time.time()
sentence = HanLP.parseDependency('万有引力是什么')
end_time = time.time()
Example No. 13
import os, sys

src_path = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src"))
sys.path.append(src_path)

from pyhanlp import HanLP
from pypinyin import load_phrases_dict  # assumed source of load_phrases_dict

from soundshapecode import ssc
from soundshapecode.variant_kmp import VatiantKMP

SIMILARITY_THRESHOLD = 0.8
SSC_ENCODE_WAY = 'SOUND'  # one of 'ALL', 'SOUND', 'SHAPE'

# Register the correct pinyin reading for the place name 沌口
load_phrases_dict({'沌口': [['zhuàn'], ['kǒu']]})

if __name__ == "__main__":
    analyzer = HanLP.newSegment('perceptron')

    chi_word1 = '沌口'
    chi_word2 = '我住在钻口'
    ssc.getHanziStrokesDict()
    ssc.getHanziStructureDict()

    chi_word1_ssc = ssc.getSSC_sentence(chi_word1, SSC_ENCODE_WAY, analyzer)
    print(chi_word1_ssc)

    chi_word2_ssc = ssc.getSSC_sentence(chi_word2, SSC_ENCODE_WAY, analyzer)
    print(chi_word2_ssc)

    # Use the KMP pattern-matching algorithm over the sound-shape codes to
    # find variant words; more efficient than brute-force matching
    kmp = VatiantKMP(SIMILARITY_THRESHOLD)
    kmp.indexKMP(chi_word2_ssc, chi_word1_ssc, SSC_ENCODE_WAY)  # main string S, pattern string T
Example No. 14
def init_hanlp():
    # enable* calls return the segmenter, so they can be chained
    segment = (HanLP.newSegment()
               .enableNameRecognize(True)
               .enableOrganizationRecognize(True)
               .enablePlaceRecognize(True)
               .enableCustomDictionaryForcing(True))
    return segment
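A short usage sketch (the sentence is an illustrative placeholder):

segment = init_hanlp()
print(segment.seg('微软公司于1975年由比尔·盖茨和保罗·艾伦创立'))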