Python TextProcess.wordSegment示例

编程语言: Python

命名空间/包名称: Tools

类/类型: TextProcess

方法/功能: wordSegment

hotexamples.com的示例: 4

Python TextProcess.wordSegment - 已找到4个示例。这些是从开源项目中提取的最受好评的Tools.TextProcess.wordSegment现实Python示例。您可以评价示例，以帮助我们提高示例质量。

常用方法

显示隐藏

postag(8)

jieba_initialize(4)

wordSegment(4)

cut(3)

load_baikeattr_name(2)

load_synonyms_word_inattr(2)

示例#1

显示文件

def qa(question):

    #初始化jieba分词器
    T.jieba_initialize()

    #切换到语料库所在工作目录
    mybot_path = './'
    # os.chdir(mybot_path)

    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/OrdinaryQuestion.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/Common conversation.aiml")
    # mybot.respond('Load Doc Snake')
    #载入百科属性列表

    input_message = question

    if len(input_message) > 60:
        print(mybot.respond("句子长度过长"))
    elif input_message.strip() == '':
        print(mybot.respond("无"))

    print(input_message)
    message = T.wordSegment(input_message)
    # 去标点
    print('word Seg:' + message)
    print('词性：')
    words = T.postag(input_message)
    if message == 'q':
        exit()
    else:
        response = mybot.respond(message)

        print("=======")
        print(response)
        print("=======")

        if response == "":
            ans = mybot.respond('找不到答案')
            # print(robot_id + ":" + ans)
            print("{0}:{1}".format(robot_id, ans))
        # 百科搜索
        elif response[0] == '#':
            # 匹配百科
            if response.__contains__("searchbaike"):
                print("search from baike")
                print(response)
                res = response.split(':')
                #实体
                entity = str(res[1]).replace(" ", "")
                #属性
                attr = str(res[2]).replace(" ", "")
                print(entity + '<---->' + attr)

                ans = baike.query(entity, attr)
                # 如果命中答案
                if type(ans) == list:
                    print("{0}:{1}".format(robot_id, QAT.ptranswer(ans,
                                                                   False)))
                elif ans.decode('utf-8').__contains__(u'::找不到'):
                    #百度摘要+Bing摘要
                    print("通用搜索")
                    log.info("通用搜索")
                    ans = search_summary.kwquery(input_message)

            # 匹配不到模版，通用查询
            elif response.__contains__("NoMatchingTemplate"):
                print("NoMatchingTemplate")
                ans = search_summary.kwquery(input_message)

            if len(ans) == 0:
                ans = mybot.respond('找不到答案')
                logs.info("{0}:{1}".format(robot_id, ans))
            elif len(ans) > 1:
                logs.info(sys.exc_info())
                logs.info("不确定候选答案")
                logs.info("[{0}][func:{1}][line:{2}]:不确定候选答案".format(
                    sys._getframe().f_code.co_filename,
                    sys._getframe().f_code.co_name,
                    sys._getframe().f_lineno))
                print(robot_id + ': ')
                for a in ans:
                    print(a)
                    # print(a.encode("utf8"))
            else:
                print('{0}:{1}'.format(robot_id, ans[0]))

        # 匹配模版
        else:
            print("{}: {}".format(robot_id, response))

示例#2

显示文件

文件： main.py 项目： zyyyyy/MillionHeroAssistant

    def __inner_job():
        start = time.time()
        text_binary = analyze_current_screen_text(
            directory=data_directory, compress_level=image_compress_level[0])

        keywords = get_text_from_image(image_data=text_binary, )
        if not keywords:
            print("text not recognize")
            return

        true_flag, question, answers = parse_question_and_answer(keywords)
        #questions=question.decode('unicode-escape')
        #new_ans=[]
        #for ans in answers:
        # new_ans.append(ans.decode('unicode-escape'))

        print('-' * 72)
        print(question)
        print('-' * 72)
        print("\n".join(answers))

        # notice browser
        if enable_chrome:
            with question_obj.get_lock():
                question_obj.value = question
                keyboard.press("space")

        search_question = pre_process_question(question)
        summary = baidu_count(search_question, answers, timeout=timeout)
        summary_li = sorted(summary.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        data = [("选项", "同比")]
        for a, w in summary_li:
            data.append((a, w))
        table = AsciiTable(data)
        print(table.table)

        print("*" * 72)
        if true_flag:
            print("肯定回答(**)： ", summary_li[0][0])
            print("否定回答(  )： ", summary_li[-1][0])
        else:
            print("肯定回答(  )： ", summary_li[0][0])
            print("否定回答(**)： ", summary_li[-1][0])
        print("*" * 72)

        ##############################################################
        input_message = question

        if len(input_message) > 60:
            print(mybot.respond("句子长度过长"))
        elif input_message.strip() == '':
            print(mybot.respond("无"))

        #print(input_message)
        message = T.wordSegment(input_message)
        # 去标点
        #print('word Seg:' + message)
        #print('词性：')
        words = T.postag(input_message)

        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)

            #print("=======")
            #print(response)
            #print("=======")

            if response == "":
                ans = mybot.respond('找不到答案')
                print('Eric：' + ans)
            # 百科搜索
            elif response[0] == '#':
                # 匹配百科
                if response.__contains__("searchbaike"):
                    #print("searchbaike")
                    #print(response)
                    res = response.split(':')
                    # 实体
                    entity = str(res[1]).replace(" ", "")
                    # 属性
                    attr = str(res[2]).replace(" ", "")
                    #print(entity + '<---->' + attr)

                    ans = baike.query(entity, attr)
                    # 如果命中答案
                    if type(ans) == list:
                        print('Eric：' + QAT.ptranswer(ans, False))

                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        # 百度摘要+Bing摘要
                        print("通用搜索")
                        ans = search_summary.kwquery(input_message)

                # 匹配不到模版，通用查询
                elif response.__contains__("NoMatchingTemplate"):
                    #print("NoMatchingTemplate")
                    ans = search_summary.kwquery(input_message, answers)

                if len(ans) == 0:
                    print('Eric：' + '找不到答案')
                elif len(ans) > 1:
                    print("不确定候选答案")
                    print('Eric: ')
                    for a in ans:
                        print(a)
                else:
                    print('Eric：' + ans[0])

            # 匹配模版
            else:
                print('Eric：' + response)

        end = time.time()
        print("use {0} 秒".format(end - start))
        save_screen(directory=data_directory)

示例#3

显示文件

def main():
    # 初始化jb分词器
    T.jieba_initialize()

    # 切换到语料库所在工作目录
    mybot_path = './'
    os.chdir(mybot_path)

    # 加载AIML的规则
    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/abc.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/bot_profile.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/general.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/infor.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/main.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/new07281.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/salutations.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/virus0727.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/zextra_weibao.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/OrdinaryQuestion.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/Common conversation.aiml")

    # mybot.respond('Load Doc Snake')
    #载入百科属性列表

    #     print '''
    # .----------------.  .-----------------. .----------------.  .----------------.  .----------------.
    # | .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
    # | |    _______   | || | ____  _____  | || |      __      | || |  ___  ____   | || |  _________   | |
    # | |   /  ___  |  | || ||_   \|_   _| | || |     /  \     | || | |_  ||_  _|  | || | |_   ___  |  | |
    # | |  |  (__ \_|  | || |  |   \ | |   | || |    / /\ \    | || |   | |_/ /    | || |   | |_  \_|  | |
    # | |   '.___`-.   | || |  | |\ \| |   | || |   / /__\ \   | || |   |  __'.    | || |   |  _|  _   | |
    # | |  |`\____) |  | || | _| |_\   |_  | || | _/ /    \ \_ | || |  _| |  \ \_  | || |  _| |___/ |  | |
    # | |  |_______.'  | || ||_____|\____| | || ||____|  |____|| || | |____||____| | || | |_________|  | |
    # | |              | || |              | || |              | || |              | || |              | |
    # | '--------------' || '--------------' || '--------------' || '--------------' || '--------------' |
    #  '----------------'  '----------------'  '----------------'  '----------------'  '----------------'
    #  Eric：你好，我是Eric。╭(╯^╰)╮
    #     '''

    print('泰康小康：你好，我是小康。╭(╯^╰)╮')

    # 对问题的处理流程
    while True:
        # 输入这个问题
        input_message = raw_input("您想问什么 >> ")

        # 对输入问题进行简单的处理：
        # 忽略过长（超过60）的问题
        # 忽略空问题
        if len(input_message) > 60:
            print(mybot.respond("句子长度过长"))
            continue
        elif input_message.strip() == '':
            print(mybot.respond("无"))
            continue

        # 利用Tools工具对问题进行处理
        print(input_message)
        message = T.wordSegment(input_message)
        # 去标点
        print('word Seg:' + message)
        # print('词性：')
        words = T.postag(input_message)

        # 退出
        if message == 'q':
            exit()
        # 返回信息的优先级
        else:
            # 首先是AIML的模板匹配
            response = mybot.respond(message)

            print("=======")
            print(response)
            print(len(response.decode('utf8')))
            print("=======")

示例#4

显示文件

def find_ans(question=''):
    global raw
    log = '答案来源：'
    cnt = 0
    input_message = question
    if len(input_message) > 60:
        return (mybot.respond("句子长度过长"), log)
    elif input_message.strip() == '':
        return (mybot.respond("无"), log)

    # 检索本地知识库得到答案，代码在QA1文件夹
    cnt += 1
    log += str(cnt) + ':检索本地知识库\n'
    old_client = Client()
    ans, log_tmp = old_client.qa_find_ans(input_message)
    log += log_tmp
    if ans != "不知道~":
        return (clean_str(ans), log)        
    log += '本地知识库找不到答案或答案不确定\n' 


    message = T.wordSegment(input_message)

    response = mybot.respond(message)
    # log += 'AIML模板返回内容:' + response + '\n'
    if response == "":
        ans = mybot.respond('不知道~')
        return (ans, log)
    elif response[0] == '#':
        cnt += 1
        log += str(cnt) + ':匹配不到问句模板\n'
        if response.__contains__("searchbaike"):
            res = response.split(':')
            entity = str(res[1]).replace(" ","")
            attr = str(res[2]).replace(" ","")
            cnt += 1
            log += str(cnt) + ':匹配到实体属性模板,' + '实体:' + entity + ' 属性:' + attr + '\n'
            ans = baike.query(entity, attr)
            if type(ans) == list:
                cnt += 1
                log += str(cnt) + ':来自百科Infobox\n'
                return (QAT.ptranswer(ans,False), log)
            elif '-找不到' in ans:
                cnt += 1
                log += str(cnt) + ':百科Infobox查询不到:' + ans + '\n'
                cnt += 1
                log += str(cnt) + '来自搜索\n'
                (ans, tmplog) = search_summary.kwquery(input_message)
                log += tmplog                    

        elif response.__contains__("NoMatchingTemplate"):
            cnt += 1
            log += str(cnt) + ':匹配不到实体关系模板\n'
            cnt += 1
            log += str(cnt) + ':来自搜索\n'
            (ans,tmplog) = search_summary.kwquery(input_message)
            log += tmplog

        if len(ans) == 0:
            cnt += 1
            log += str(cnt) + ':未查询到答案\n'
            return (mybot.respond('不知道~'), log)

        elif len(ans) >1:
            cnt += 1
            log += str(cnt) + ':返回百度摘要\n'
            if raw == False and ('什么是' in question or '是什么' in question):
                result = "给你找到几篇新闻："
                for a in ans:
                    result += a + '\n'
                return (result, log)
            else:
                raw = False
                question = question.replace("是什么", "").replace("什么是", "")
                ans2, log2 = find_ans(question + "是什么")
                ans1, log1 = find_ans("什么是" + question) 
                if "给你找到几篇新闻" not in ans1:
                    return (ans1, log1)
                else:
                    return (ans2, log2)
        else:
            return (clean_str(ans[0]), log)



    # 直接匹配问句模版
    else:
        cnt += 1
        log += str(cnt) + ':匹配问句模板\n'
        return (clean_str(response), log)