Example #1
0
def extractSummary(document, size, sentence_separator=None):
    """Automatic summarization.

    :param document: the target document text
    :param size: number of key sentences to extract
    :param sentence_separator: optional sentence-delimiter regex used to
        split the document, e.g. ``[。??!!;;]``; when empty or omitted,
        HanLP's default splitting is used
    :return: list of key sentences
    """
    # Falsy separator (None or "") falls back to HanLP's default splitting.
    if not sentence_separator:
        return HanLP.extractSummary(document, size)
    return HanLP.extractSummary(document, size, sentence_separator)
Example #2
0
    def split_test(self, sentence):
        """Smoke-test HanLP: segmentation, keyword extraction, summarization
        and dependency parsing on fixed sample sentences.

        NOTE(review): the ``sentence`` parameter is never used — it is
        shadowed by the loop variable below; confirm this is intentional.
        """
        # Earlier jieba-based preprocessing, kept commented for reference:
        #line = sentence.strip().decode('utf-8', 'ignore')  # strip surrounding whitespace, decode to Unicode
        #line1 = re.sub("[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+".decode("utf8"),
        #               " ".decode("utf8"), line)
        #wordList = list(jieba.cut(line1))  # segment each line with jieba

        print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
        for term in HanLP.segment('下雨天地面积水'):
            print('{}\t{}'.format(term.word, term.nature))  # word and its part-of-speech tag
        # Classic ambiguous Chinese sentences used to stress the segmenter.
        testCases = [
            "商品和服务", "结婚的和尚未结婚的确实在干扰分词啊", "买水果然后来世博园最后去世博会", "中国的首都是北京",
            "欢迎新老师生前来就餐", "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
            "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
        ]
        for sentence in testCases:
            print(HanLP.segment(sentence))
        # Keyword extraction
        document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
                   "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
                   "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
                   "严格地进行水资源论证和取水许可的批准。"
        print(HanLP.extractKeyword(document, 2))
        # Automatic summarization
        print(HanLP.extractSummary(document, 3))
        # Dependency parsing
        print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
Example #3
0
from pyhanlp import *

# Demo script: exercises HanLP segmentation, keyword extraction,
# summarization and dependency parsing on sample Chinese text.
print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
for term in HanLP.segment('下雨天地面积水'):
    print('{}\t{}'.format(term.word, term.nature))  # word and its part-of-speech tag
# Classic ambiguous sentences that stress the segmenter.
testCases = [
    "商品和服务", "结婚的和尚未结婚的确实在干扰分词啊", "买水果然后来世博园最后去世博会", "中国的首都是北京", "欢迎新老师生前来就餐",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
]
for sentence in testCases:
    print(HanLP.segment(sentence))
print("# 关键词提取")
document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
           "严格地进行水资源论证和取水许可的批准。"
print(HanLP.extractKeyword(document, 2))  # top-2 keywords
print("# 自动摘要")
print(HanLP.extractSummary(document, 3))  # top-3 summary sentences
print("# 依存句法分析")
print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))

doc = "句法分析是自然语言处理中的关键技术之一,其基本任务是确定句子的句法结构或者句子中词汇之间的依存关系。\
主要包括两方面的内容,一是确定语言的语法体系,即对语言中合法的句子的语法结构给与形式化的定义;另一方面是句法分析技术,即根据给定的语法体系,自动推导出句子的句法结构,分析句子所包含的句法单位和这些句法单位之间的关系。"

print("关键词")
print(HanLP.extractKeyword(doc, 2))
print("# 自动摘要")
print(HanLP.extractSummary(doc, 3))
Example #4
0
def main():
    """Command-line entry point for the pyhanlp tool.

    Builds the argparse CLI and dispatches to one of the sub-tasks
    (``segment`` / ``parse`` / ``keyword`` / ``summary`` / ``serve`` /
    ``update``), reading input lines from stdin where applicable.
    """
    if len(sys.argv) == 1:
        # No arguments: behave as if --help was requested.
        sys.argv.append('--help')

    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(
            HANLP_JAR_VERSION))
    arg_parser.add_argument('-v',
                            '--version',
                            required=False,
                            action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task",
                                            help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment',
                                            help='word segmentation')
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag',
                            dest='tag',
                            action='store_true',
                            help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag',
                            dest='tag',
                            action='store_false',
                            help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument(
        '-a',
        '--algorithm',
        type=str,
        default='viterbi',
        help='algorithm of segmentation e.g. perceptron')
    parse_parser = task_parser.add_parser(name='parse',
                                          help='dependency parsing')
    # Fixed copy-pasted help texts: these tasks perform keyword extraction
    # and text summarization, not dependency parsing.
    parse_keyword = task_parser.add_parser(name='keyword',
                                           help='keyword extraction')
    parse_summary = task_parser.add_parser(name='summary',
                                           help='text summarization')
    server_parser = task_parser.add_parser(
        name='serve',
        help='start http server',
        description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    # Registration only; the returned parser object is not needed afterwards.
    task_parser.add_parser(name='update',
                           help='update jar and data of HanLP')

    def add_args(p):
        # Shared --config option for tasks that consult hanlp.properties.
        p.add_argument("--config",
                       default=PATH_CONFIG,
                       help='path to hanlp.properties')

    add_args(segment_parser)
    add_args(parse_parser)
    add_args(parse_keyword)
    add_args(parse_summary)

    if '-v' in sys.argv or '--version' in sys.argv:
        # Report installed jar/data/config versions and exit successfully.
        print('jar  {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else '自定义',
                                   HANLP_DATA_PATH))
        print('config    : {}'.format(
            os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        # print to stderr
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        # print an error to stderr and terminate with a non-zero status
        eprint(msg)
        exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine'
                   ).HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JException as e:
            if isinstance(e, java.lang.IllegalArgumentException):
                die('invalid algorithm {}'.format(args.algorithm))
            elif isinstance(e, java.lang.RuntimeException):
                die('failed to load required model')
            else:
                die('unknown exception {}'.format(repr(e)))

        if not args.tag:
            # A lexical analyzer must additionally have tagging switched off
            # on the segmenter itself; the global flag is needed either way
            # (the original code set ShowTermNature = False in both branches).
            if hasattr(segmenter, 'analyze'):
                segmenter.enablePartOfSpeechTagging(False)
            JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString()
                           for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'keyword':
        # Removed a dead per-line JClass('...TextRankKeyword') assignment;
        # extractKeyword uses that class internally.
        # NOTE(review): unlike 'parse', input is not passed through any2utf8
        # here — confirm whether Python 2 input needs conversion.
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.extractKeyword(line, 3))
    elif args.task == 'summary':
        # Removed a dead per-line JClass('...TextRankSentence') assignment;
        # extractSummary uses that class internally.
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.extractSummary(line, 3))
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('现在server.py暂时不支持Python2,欢迎参与移植')
    elif args.task == 'update':
        if hanlp_installed_data_version() == '手动安装':
            die('手动配置不支持自动升级,若要恢复自动安装,请清除HANLP相关环境变量')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()
Example #5
0
def extract_summa(s):
    """Summarize *s* into up to 15 sentences (split on spaces via the
    ``[ ]`` separator regex) and keep only sentences whose length is
    strictly between 10 and 40 characters."""
    candidates = HanLP.extractSummary(s, 15, r'[ ]')
    return [sent for sent in candidates if 10 < len(sent) < 40]
Example #6
0
    return [s for s in summaries if 'c' not in extract_s_nature(s) and 'cc' not in extract_s_nature(s)]

# Fetch the distinct taptap game ids whose comments will be summarized.
sql = "SELECT distinct game_id FROM game_source.s_game_comments_taptap_game WHERE source='taptap'"
game_id_list = from_sql(sql)  # presumably returns a DataFrame with a 'game_id' column — confirm against from_sql
game_s = pd.DataFrame()  # accumulator frame; not filled in the visible code
from datetime import datetime
for game_id in tqdm(list(game_id_list['game_id'])):
    sql = """
    SELECT source, game_id, game_name, content FROM game_source.s_game_comments_taptap_game where game_id = %s and length(content)>300
    """ % (game_id, )
    df = from_sql(sql)
    
    df['content'] = df['content'].apply(clear_text)
    df['content'] = df['content'].apply(get_help_content)

    df['summaries']=df['content'].apply(lambda x: list(HanLP.extractSummary(x, 1, r'[ ]')))
    df_new = df[['source', 'game_id', 'game_name', 'summaries']].groupby(['source', 'game_id','game_name']).agg(join).reset_index()
    
    #做了两次的提取摘要,
    try:
        df_new['summaries'] = df_new['summaries'].apply(extract_summa)
        df_new['summaries'] = df_new['summaries'].apply(clear_summa)
    except:
        print('summar is []')
        continue

    if df_new.iloc[0]['summaries'] == []:
        continue

    trg_db='game_process'
    trg_table='c_lcs_game_comment_summary'