Example #1
    def split_test(self, sentence):  # note: the sentence parameter is unused in the body below
        # line = sentence.strip().decode('utf-8', 'ignore')  # strip surrounding whitespace and decode to Unicode (Python 2 era)
        # line1 = re.sub("[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+".decode("utf8"),
        #                " ".decode("utf8"), line)
        # wordList = list(jieba.cut(line1))  # segment each line with jieba

        print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
        for term in HanLP.segment('下雨天地面积水'):
            print('{}\t{}'.format(term.word, term.nature))  # get each word and its part of speech
        testCases = [
            "商品和服务", "结婚的和尚未结婚的确实在干扰分词啊", "买水果然后来世博园最后去世博会", "中国的首都是北京",
            "欢迎新老师生前来就餐", "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
            "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
        ]
        for case in testCases:  # renamed from 'sentence' to avoid shadowing the parameter
            print(HanLP.segment(case))
        # keyword extraction
        document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
                   "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
                   "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
                   "严格地进行水资源论证和取水许可的批准。"
        print(HanLP.extractKeyword(document, 2))
        # automatic summarization
        print(HanLP.extractSummary(document, 3))
        # dependency parsing
        print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
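
The method above is excerpted from a class and shown without its imports; it presumably relies on something like the following (an assumption, since the enclosing module is not shown):

from pyhanlp import HanLP
# import re, jieba  # only needed if the commented-out jieba preprocessing is restored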
Example #2
def show_words():
    sql = 'SELECT * FROM NEWSWB'
    lock.acquire()
    cursor.execute(sql)
    lock.release()
    news = cursor.fetchone()  # note: the fetch happens after the lock is released
    print(news[5], '>>>>>>>', HanLP.extractKeyword(news[5], 5))  # column 5 presumably holds the article text
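
show_words relies on module-level lock and cursor objects that this excerpt does not define. A plausible setup, with the pymysql driver and all connection parameters as pure assumptions:

import threading
import pymysql  # assumed DB-API driver; any other would work the same way

lock = threading.Lock()
conn = pymysql.connect(host='localhost', user='root', password='secret',
                       database='news', charset='utf8mb4')
cursor = conn.cursor()

Note that fetchone() runs after the lock is released; if several threads share the cursor, the fetch arguably belongs inside the critical section as well.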
Example #3
def get_keyword(content, keynum=2):
    """
    Extract the keywords of a question; the number of keywords is controlled by keynum.
    :param content: a sentence
    :param keynum: how many keywords to extract
    :return: list of keywords
    """
    keywordList = HanLP.extractKeyword(content, keynum)
    return keywordList
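
A quick usage sketch; extractKeyword returns a Java List proxy, so wrapping it in list() gives a plain Python list:

print(list(get_keyword('水利部水资源司司长在新闻发布会上透露考核结果', keynum=3)))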
Example #4
File: api.py  Project: wisonwang/pyhanlp
def extractKeyword(document, size):
    """
    Extract keywords.

    :param document: document content
    :param size: how many keywords to extract
    :return: a list of keywords
    """
    return HanLP.extractKeyword(document, size)
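
Callers can then import the wrapper instead of touching HanLP directly; a sketch assuming the module path implied by the file name above:

from api import extractKeyword
print(extractKeyword('水资源管理制度的考核已经完成', 2))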
Example #5
def get_keywords():
    try:
        sql = 'SELECT * FROM NEWSWB'
        lock.acquire()
        cursor.execute(sql)
        lock.release()
        news = cursor.fetchall()
        for n in news:
            # n[5] presumably holds the article text, n[0] its primary key
            insert_keywords_into_mysql(HanLP.extractKeyword(n[5], 5), n[0])
            print(n[0], ' finish')
    except IOError as err:  # note: IOError will not catch database driver errors
        print(err)
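
insert_keywords_into_mysql is not shown in this excerpt; a minimal sketch of what it might do, with a purely hypothetical table layout, reusing the excerpt's lock and cursor:

def insert_keywords_into_mysql(keywords, news_id):
    # hypothetical schema: a KEYWORDS column keyed by ID
    sql = 'UPDATE NEWSWB SET KEYWORDS = %s WHERE ID = %s'
    lock.acquire()
    try:
        cursor.execute(sql, (','.join(str(w) for w in keywords), news_id))
    finally:
        lock.release()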
Example #6
    def extractKeyword(self, sent, num=2):
        """
        Extract keywords, filtering out stopwords.
        :param sent: input sentence
        :param num: number of keywords requested from HanLP
        :return: list of keywords not found in self.stopwords
        """
        res = HanLP.extractKeyword(sent, num)
        return [word for word in res if word not in self.stopwords]
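
The surrounding class is not shown; self.stopwords is presumably a set loaded from a word list, along these lines (class name and file path are assumptions):

class KeywordExtractor:
    def __init__(self, stopwords_path='stopwords.txt'):  # hypothetical path
        with open(stopwords_path, encoding='utf-8') as f:
            self.stopwords = {line.strip() for line in f if line.strip()}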
Example #7
import re
from jieba import analyse
from pyhanlp import HanLP

def siglerow(text, keyword_num=1):
    text = text[text.find(':') + 1:]  # drop everything up to and including the first full-width colon
    text = ','.join(re.compile(r'[\u4e00-\u9fa5]+').findall(text))  # keep only runs of Chinese characters
    kw1 = analyse.textrank(text, keyword_num, allowPOS=('ns', 'n', 'vn', 'b'))
    kw2 = HanLP.extractKeyword(text, 1)

    k_words = HanLP.segment(text)
    kw3 = ''
    for i in k_words:
        if str(i.nature)[0] == 'n':  # first token whose part-of-speech tag starts with 'n' (noun-like)
            kw3 = str(i.word)
            break
    kw4 = analyse.textrank(text, keyword_num)

    if kw1:
        return str(kw1[0])  # jieba keyword, restricted to place names, nouns, and verbal nouns
    elif kw2:
        return str(kw2[0])  # HanLP keyword
    elif kw3:
        return kw3  # first noun-like word from segmentation
    elif kw4:
        return str(kw4[0])  # unrestricted jieba TextRank keyword
    else:
        return ''
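
A usage sketch; the leading find(':') suggests input lines of the form 'speaker:utterance' with a full-width colon:

print(siglerow('记者:水利部今天在北京举行新闻发布会'))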
Example #8
from pyhanlp import *

print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
for term in HanLP.segment('下雨天地面积水'):
    print('{}\t{}'.format(term.word, term.nature))  # get each word and its part of speech
testCases = [
    "商品和服务", "结婚的和尚未结婚的确实在干扰分词啊", "买水果然后来世博园最后去世博会", "中国的首都是北京", "欢迎新老师生前来就餐",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
]
for sentence in testCases:
    print(HanLP.segment(sentence))
print("# 关键词提取")
document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
           "严格地进行水资源论证和取水许可的批准。"
print(HanLP.extractKeyword(document, 2))
print("# 自动摘要")
print(HanLP.extractSummary(document, 3))
print("# 依存句法分析")
print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))

doc = "句法分析是自然语言处理中的关键技术之一,其基本任务是确定句子的句法结构或者句子中词汇之间的依存关系。\
主要包括两方面的内容,一是确定语言的语法体系,即对语言中合法的句子的语法结构给与形式化的定义;另一方面是句法分析技术,即根据给定的语法体系,自动推导出句子的句法结构,分析句子所包含的句法单位和这些句法单位之间的关系。"

print("关键词")
print(HanLP.extractKeyword(doc, 2))
print("# 自动摘要")
print(HanLP.extractSummary(doc, 3))
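
parseDependency returns a CoNLLSentence that can also be walked word by word; a short follow-on sketch using the iteration pattern from the pyhanlp README:

sentence = HanLP.parseDependency('徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。')
for word in sentence.iterator():
    print('%s --(%s)--> %s' % (word.LEMMA, word.DEPREL, word.HEAD.LEMMA))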
Example #9
def extract_keyword(item):
    words = HanLP.extractKeyword(item, 1)
    if words.isEmpty():  # no keyword found; fall back to the whole input
        return item
    return str(words[0])
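
Usage sketch: the function returns the top keyword, or the input text itself when HanLP finds none:

print(extract_keyword('水资源管理制度考核'))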
Example #10
def main():
    if len(sys.argv) == 1:
        sys.argv.append('--help')

    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(
            HANLP_JAR_VERSION))
    arg_parser.add_argument('-v',
                            '--version',
                            required=False,
                            action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task",
                                            help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment',
                                            help='word segmentation')
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag',
                            dest='tag',
                            action='store_true',
                            help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag',
                            dest='tag',
                            action='store_false',
                            help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument(
        '-a',
        '--algorithm',
        type=str,
        default='viterbi',
        help='algorithm of segmentation e.g. perceptron')
    parse_parser = task_parser.add_parser(name='parse',
                                          help='dependency parsing')
    parse_keyword = task_parser.add_parser(name='keyword',
                                           help='keyword extraction')
    parse_summary = task_parser.add_parser(name='summary',
                                           help='automatic summarization')
    server_parser = task_parser.add_parser(
        name='serve',
        help='start the HTTP server',
        description='An HTTP server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    update_parser = task_parser.add_parser(name='update',
                                           help='update jar and data of HanLP')

    def add_args(p):
        p.add_argument("--config",
                       default=PATH_CONFIG,
                       help='path to hanlp.properties')
        # p.add_argument("--action", dest="action", default='predict',
        #                help='Which action (train, test, predict)?')

    add_args(segment_parser)
    add_args(parse_parser)
    add_args(parse_keyword)
    add_args(parse_summary)

    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar  {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else 'custom',
                                   HANLP_DATA_PATH))
        print('config    : {}'.format(
            os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        eprint(msg)
        exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine'
                   ).HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JException as e:
            if isinstance(e, java.lang.IllegalArgumentException):
                die('invalid algorithm {}'.format(args.algorithm))
            elif isinstance(e, java.lang.RuntimeException):
                die('failed to load required model')
            else:
                die('unknown exception {}'.format(repr(e)))

        is_lexical_analyzer = hasattr(segmenter, 'analyze')
        if not args.tag:
            if is_lexical_analyzer:
                segmenter.enablePartOfSpeechTagging(False)
            JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString()
                           for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'keyword':
        for line in sys.stdin:
            line = line.strip()
            keyword_list = HanLP.extractKeyword(line, 3)
            print(keyword_list)
    elif args.task == 'summary':
        for line in sys.stdin:
            line = line.strip()
            sentence_list = HanLP.extractSummary(line, 3)
            print(sentence_list)
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die("server.py does not support Python 2 yet; contributions to port it are welcome")
    elif args.task == 'update':
        if hanlp_installed_data_version() == '手动安装':  # the literal means "manually installed"
            die('A manual installation does not support automatic upgrades; '
                'to restore automatic installs, clear the HANLP-related environment variables')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()
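
The CLI reads lines from stdin; a hedged way to drive it from Python, assuming the hanlp console script that pyhanlp installs is on PATH:

import subprocess

result = subprocess.run(['hanlp', 'keyword'],
                        input='水利部今天举行新闻发布会介绍水资源管理考核情况'.encode('utf-8'),
                        capture_output=True)
print(result.stdout.decode('utf-8'))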
Example #11
from pyhanlp import HanLP

texts = "中国是一个文明古国,拥有56个民族,文化历史厚重。"

# word segmentation
word_cut = HanLP.segment(texts)

print("segmentation result:\n", word_cut)

for term in word_cut:
    print("word: %s;  part of speech: %s" % (term.word, term.nature))

testCases = [
    "商品和服务", "结婚的和尚未结婚的确实在干扰分词啊", "买水果然后来世博园最后去世博会", "中国的首都是北京", "欢迎新老师生前来就餐",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
]
# for sentence in testCases:
#     print(HanLP.segment(sentence))

# keyword extraction
document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
           "严格地进行水资源论证和取水许可的批准。"
print(HanLP.extractKeyword(document, 5))

# automatic summarization
print(HanLP.extractSummary(document, 3))
# dependency parsing
print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
Example #12
from pyhanlp import HanLP

def hanlp_keyword(instr):
    # TextRank-based keyword extraction; ask HanLP for the top 10 keywords
    return HanLP.extractKeyword(instr, 10)
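
Usage sketch; ten keywords is more than a short input can yield, in which case the returned list is simply shorter:

print(hanlp_keyword('水利部水资源司司长陈明忠在国务院新闻办的新闻发布会上透露了水资源管理制度考核的结果。'))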