Example #1
from flask import request
import os
import json
import TextProcess

def text():
    print('Receiving text...')
    message = request.values.get('Body', None)
    person = request.values.get('From', None)
    # Reject the request before touching the message body if the POST data is incomplete.
    if not message or not person:
        return 'Incorrect POST data'

    # Load the per-sender game registration, if one exists.
    jdata = {}
    if "textReg.json" in os.listdir("."):
        with open("textReg.json", "r") as jfile:
            jdata = json.load(jfile)

    # A message of the form "switch <game>" re-registers the sender under that game.
    if len(message.split()) > 1 and message.split()[0] == "switch":
        jdata[person] = message.split()[1]

    # Look up the sender's registered game; fall back to the default otherwise.
    if person in jdata:
        gamename = jdata[person] + "@nathanp.me"
    else:
        gamename = '*****@*****.**'
        jdata[person] = "treehacks"

    with open("textReg.json", "w") as jfile:
        json.dump(jdata, jfile)

    TextProcess.evalAndRespond(person, message, gamename)
    return 'OK'
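
This handler reads the 'Body' and 'From' form fields that Twilio posts to an SMS webhook, so it presumably runs as a Flask POST route. A minimal wiring sketch, assuming text() is defined in the same module; the route path and port are illustrative rather than taken from the original project:

# Hypothetical route registration for the SMS webhook above.
from flask import Flask

app = Flask(__name__)
# Twilio posts 'Body' (message text) and 'From' (sender number) to this endpoint.
app.add_url_rule('/sms', 'sms', text, methods=['POST'])

if __name__ == '__main__':
    app.run(port=5000)  # port is an assumption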
Example #2
def email():
    # Inbound-email webhook: the mail provider posts 'to', 'from', and 'text' form fields.
    print('Receiving email...')
    email = request.values.get('to', None)
    person = request.values.get('from', None)
    text = request.values.get('text', None)
    if not email or not person:
        return "Incorrect POST data"
    if not text:
        text = ""
    TextProcess.evalAndRespond(person, text, email)
    return 'OK'
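
email() mirrors the SMS handler but reads the 'to', 'from', and 'text' fields of an inbound-email webhook (the field names match SendGrid's Inbound Parse payload, though the source does not name the provider). A small smoke test with Flask's test client, assuming the app object from the previous sketch and an illustrative '/email' route:

# Hypothetical check of the email webhook using Flask's built-in test client.
app.add_url_rule('/email', 'email', email, methods=['POST'])

with app.test_client() as client:
    resp = client.post('/email', data={
        'to': 'game@example.com',        # illustrative values, not from the source
        'from': 'player@example.com',
        'text': 'go north',
    })
    print(resp.data)  # expected: 'OK' when both 'to' and 'from' are present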
Example #3
def query(entity,attr):
    # Fetch the Baidu Baike page for the entity and parse its infobox.
    soup = To.get_html_baidu("http://baike.baidu.com/item/"+entity)
    basicInfo_block = soup.find(class_='basic-info cmn-clearfix')
    if basicInfo_block is None:
        return attr + "::找不到"
    else:
        info = get_info(basicInfo_block)
        if info.has_key(attr.decode('utf8')):
            return info[attr.decode('utf8')]
        else:
            # The attribute is not in the infobox; retry with its synonyms.
            attr_list = T.load_baikeattr_name('./resources/Attribute_name.txt')
            attr = T.load_synonyms_word_inattr(attr,'./resources/SynonDic.txt',attr_list)
            if info.has_key(attr.decode('utf8')):
                return info[attr.decode('utf8')]
            else:
                return attr + "::找不到"
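
query() returns either the value found in the Baidu Baike infobox or the attribute name suffixed with "::找不到" ("not found"). A hedged usage sketch; the entity and attribute strings are examples, and the To/T helper modules that query() relies on are assumed to be importable:

# -*- coding: utf-8 -*-
# Illustrative Python 2 call of query(); the arguments are byte strings, since the
# function decodes attr from UTF-8 itself.
print(query("刘德华", "国籍"))   # prints the infobox value, or "国籍::找不到" if missing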
Example #4
def kwquery(query):
    # Tokenize, drop stopwords, and extract the keywords.
    keywords = []
    words = T.postag(query)
    for k in words:
        # Keep only words whose POS tag contains "n" (nouns).
        if "n" in k.flag:
            keywords.append(k.word)

    answer = []
    text = ''
    # Set to 1 once a knowledge-graph / baike answer has been found.
    flag = 0

    # Scrape the abstracts of the top Baidu results.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' +
                                   quote(query))

    for i in range(1, 10):
        if soup_baidu is None:
            break
        results = soup_baidu.find(id=i)
        if results is None:
            print "百度摘要找不到答案"
            break

        # If the first result carries a 'mu' attribute, it is a Baidu
        # knowledge-graph card, so take its exact answer directly.
        if results.attrs.has_key('mu') and i == 1:
            r = results.find(class_='op_exactqa_s_answer')
            if r is None:
                print "百度知识图谱找不到答案"
            else:
                print "百度知识图谱找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Classical-poetry card: same 'mu' check, different answer class.
        if results.attrs.has_key('mu') and i == 1:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r is None:
                print "百度诗词找不到答案"
            else:
                print "百度诗词找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Calculator card: the result sits inside a nested table in the snippet.
        if (results.attrs.has_key('mu') and i == 1 and
                'http://open.baidu.com/static/calculator/calculator.html'
                in results.attrs['mu']):
            r = results.find('div').find_all('td')[1].find_all('div')[1]
            if r is None:
                print "计算器找不到答案"
            else:
                print "计算器找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Baidu Zhidao best-answer card: follow the link and read the answer body.
        if results.attrs.has_key('mu') and i == 1:
            r = results.find(class_='op_best_answer_question_link')
            if r is None:
                print "百度知道图谱找不到答案"
            else:
                print "百度知道图谱找到答案"
                url = r['href']
                zhidao_soup = To.get_html_zhidao(url)
                r = zhidao_soup.find(class_='bd answer').find('pre')
                answer.append(r.get_text())
                flag = 1
                break

        if results.find("h3") != None:
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度知道") and i == 1:
                url = results.find("h3").find("a")['href']
                if url == None:
                    print "百度知道图谱找不到答案"
                    continue
                else:
                    print "百度知道图谱找到答案"
                    zhidao_soup = To.get_html_zhidao(url)

                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    else:
                        r = r.find('pre')

                    answer.append(r.get_text().strip())
                    flag = 1
                    break

        # No structured answer here; keep the abstract text for the fallback analysis.
        text += results.get_text()

    if flag == 1:
        return answer

    # Fetch the Bing result page.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' +
                                 quote(query))
    # Check whether Bing's knowledge graph gives a direct answer.
    bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    if bingbaike is not None:
        print "Bing知识图谱找到答案"
        flag = 1
        answer.append(bingbaike.get_text())
        return answer
    else:
        print "Bing知识图谱找不到答案"
        results = soup_bing.find(id="b_results")
        text += results.get_text()

    # If neither search engine's knowledge graph produced an answer,
    # analyse the collected abstracts instead.
    if flag == 0:
        # Split the collected text into sentences.
        cutlist = [u"。", u"?", u".", u"_", u"-", u":", u"!", u"?"]
        temp = ''
        sentences = []
        for i in range(0, len(text)):
            if text[i] in cutlist:
                if temp == '':
                    continue
                else:
                    sentences.append(temp)
                temp = ''
            else:
                temp += text[i]

        # Keep only sentences that contain at least one keyword; drop the rest.
        key_sentences = {}
        for s in sentences:
            for k in keywords:
                if k in s:
                    key_sentences[s] = 1

        # Apply question-specific rules.

        # Recognise person names (POS tag "nr") and count their frequency.
        target_list = {}
        for ks in key_sentences:
            words = T.postag(ks)
            for w in words:
                if w.flag == "nr":
                    if target_list.has_key(w.word):
                        target_list[w.word] += 1
                    else:
                        target_list[w.word] = 1

        # Sort the candidates by frequency, highest first.
        sorted_lists = sorted(target_list.items(),
                              key=lambda x: x[1],
                              reverse=True)
        # Build the candidate queue, dropping words that already
        # appear as keywords in the question.
        sorted_lists2 = []
        for i, st in enumerate(sorted_lists):
            if st[0] in keywords:
                continue
            else:
                sorted_lists2.append(st)

        print "返回前n个词频"
        # Return at most the top three candidates.
        answer = []
        for i, st in enumerate(sorted_lists2):
            if i < 3:
                answer.append(st[0])

    return answer
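
kwquery() returns a list: a single answer when one of the knowledge-graph cards matched, otherwise up to three candidate person names mined from the search abstracts. A minimal Python 2 usage sketch; the question is illustrative, quote is assumed to come from urllib, and jieba is assumed to be initialised as in Example #5:

# -*- coding: utf-8 -*-
# Illustrative call of kwquery(); requires the To/T crawler helpers used above.
answers = kwquery("泰坦尼克号的女主角是谁")   # "Who is the lead actress of Titanic?"
for a in answers:
    print(a)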
Example #5
#coding:utf8
import aiml
import os
import TextProcess as T
import Tools as QAT
from QACrawler import baike
from QACrawler import search_summary

if __name__ == '__main__':

    # Initialise the jieba (jb) tokenizer.
    T.jieba_initialize()

    # Switch to the working directory that holds the corpus.
    mybot_path = './'
    os.chdir(mybot_path)

    mybot = aiml.Kernel()
    mybot.learn("./resources/std-startup.xml")
    mybot.respond('Load Doc Snake')
    # Load the baike attribute list.

    print '''
.----------------.  .-----------------. .----------------.  .----------------.  .----------------.
| .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
| |    _______   | || | ____  _____  | || |      __      | || |  ___  ____   | || |  _________   | |
| |   /  ___  |  | || ||_   \|_   _| | || |     /  \     | || | |_  ||_  _|  | || | |_   ___  |  | |
| |  |  (__ \_|  | || |  |   \ | |   | || |    / /\ \    | || |   | |_/ /    | || |   | |_  \_|  | |
| |   '.___`-.   | || |  | |\ \| |   | || |   / /__\ \   | || |   |  __'.    | || |   |  _|  _   | |
| |  |`\____) |  | || | _| |_\   |_  | || | _/ /    \ \_ | || |  _| |  \ \_  | || |  _| |___/ |  | |
| |  |_______.'  | || ||_____|\____| | || ||____|  |____|| || | |____||____| | || | |_________|  | |