def baidu_ym(question):
    """Crawl a result page for ``question`` and extract an answer with MRC.

    Every 'Ж' placeholder in the question is replaced by '_' before the
    page is fetched through ``bdzdym.baiduyemian``. The page text is
    stripped of BOM / full-width space / NBSP / line breaks / backslashes,
    stored via ``gqsavetojson.gqtojson`` and then ``run_mrc.abc`` is run.

    Returns:
        tuple: ``(answer, paragraph)`` where ``answer`` is the cleaned
        ``'text'`` of the first MRC result (or ``''`` when that text is
        found in ``mrc_p``) and ``paragraph`` is the first 1000 characters
        of the cleaned page.
    """
    query = question.replace('Ж', '_')
    raw_page = bdzdym.baiduyemian(query)

    # Remove invisible characters and escapes one by one.
    page_text = raw_page.strip()
    for junk in ('\ufeff', '\u3000', '\xa0', '\n', '\r', '\\'):
        page_text = page_text.replace(junk, '')
    print('页面::', page_text)

    gqsavetojson.gqtojson(page_text, query)
    best, _ = run_mrc.abc()

    answer = best['text'].strip()
    for junk in ('\'', '\r', '\n', '\"'):
        answer = answer.replace(junk, '')

    # Keep at most 1000 characters of the page as the returned paragraph.
    paragraph = page_text[0:1000]
    if answer in mrc_p:
        return '', paragraph
    return answer, paragraph
示例#2
0
def test_zyb(question):
    """Answer ``question`` from crawled text using the MRC model.

    Steps:
      1. Clean the question ('+' removed, then ``clear_input``).
      2. Query the similarity bank through ``similar`` (its answer is
         currently discarded, see the note below).
      3. Crawl text with ``bdzdym.sim_baidu``; when fewer than 100
         characters come back, fall back to ``bdzdym.baiduyemian``.
      4. For yes/no questions (a pattern from ``YN_list`` matches) ask
         the model once per entry in ``YN_tails`` and once for the
         verdict; otherwise ask the model directly and add a follow-up.

    Every outcome is logged with ``daily``.

    NOTE(review): a later ``def test_zyb`` in this file rebinds the name.

    Args:
        question: raw question string.

    Returns:
        The answer returned by ``correction``, or the fixed string
        '算了半天还是没有答案' when no answer text was produced.
    """
    a1 = time.time()  # start of timing
    #time.sleep(0.3)  # optional delay between questions
    # 1. Clean the question
    question = question.replace('+', '')
    question = clear_input(question)  # turn the interrogative part into 'Ж'
    print('清理过字符的问题==', question)

    # 2. Similarity lookup against the existing question bank
    similar_a, similar_par = similar(question)
    # NOTE(review): the similarity answer is unconditionally discarded, so
    # the branch below never runs — presumably a temporary switch; confirm.
    similar_a = ''

    if similar_a != '':  # a similar stored question produced an answer

        aa = similar_a
        paqu = similar_par
        fromwhere = '从相似度计算得来'
        a = correction(question, aa)  # corrected answer
        daily(question, paqu, aa, a, fromwhere)  # write the log entry
        a2 = time.time()
        print('相似度计算计算耗时==', str(a2 - a1), a)
        return a

    else:
        # 3. Crawl and compute
        if question.find('Ж') == -1:
            question += ",_"
        print('qqqqqqqqqqqqq', question)
        bdzd_para0 = bdzdym.sim_baidu(question)  # crawl Q&A text
        bzz = bdzd_para0.strip()
        for junk in ('\ufeff', '\u3000', '\xa0', '\n', '\r', '\\'):
            bzz = bzz.replace(junk, '')
        fromwhere = '从百度知道得来'
        if len(bzz) < 100:
            # Too little text: fall back to crawling the result page.
            bzz = bdzdym.baiduyemian(question.replace('Ж', '_'))
            for junk in ('\ufeff', '\u3000', '\xa0', '\n', '\r', '\\'):
                bzz = bzz.replace(junk, '')
            bzz = bzz.strip()
            print('搜搜爬取', bzz[:1500])  # show at most 1500 characters
            fromwhere = '从soso页面得来'
        sp_bzz = re.split(r'[。;!]', bzz)
        bdzd_a = ''
        print(bzz)
        print(len(sp_bzz), '个大句子   ', len(bzz), '字!')

        YNQ = None  # match object when this is a yes/no question
        for YNR in YN_list:
            YNQ = re.search(re.compile(YNR), question)
            if YNQ:
                break

        if YNQ:  # yes/no question: look for evidence first, then the verdict
            for tail in YN_tails:

                gqtojson(bzz[:800] + YN_text, question + tail)
                an, an1 = run_mrc.abc(
                    init_check='data/models/step_30000')  # alternate model
                if an['text'].find('没有答案') == -1:
                    bdzd_a += '小呆找依据:' + an['text'] + '...'

            # Pre-bind so the debug print below cannot raise NameError when
            # the verdict prediction fails before assigning them.
            yn, yns = None, None
            try:
                gqtojson(bzz[:800] + YN_text, question)
                yn, yns = run_mrc.abc(init_check='data/models/step_30000')
                if yn['text'] in ['(是的)', '(不是)', '(不一定)']:
                    bdzd_a += '小萌说了算:' + yn['text'] + '##'
            except Exception as e:
                print(e)
            print(yn, yns)
            print('YN******', bdzd_a)

        else:
            gqtojson(bzz[:1500], question)
            an, ans = run_mrc.abc()
            # A long or multi-sentence answer is fed back as the passage
            # for a second prediction.
            if len(re.split(r'[。;!]', an['text'])) > 2 or len(
                    an['text']) > 100:
                gqtojson(an['text'], question)
                an, ans = run_mrc.abc()
            print(an, '\n', ans)
            bdzd_a += '小呆会翻书:' + an['text'] + '##'

            if an['text'] in mrc_p:
                # NOTE(review): ``an`` never changes inside the loop, so
                # this walks every entry of ``ans`` and keeps the last
                # text — confirm that is the intended fallback.
                wh = 0
                while an['text'] not in mrc_p or wh < len(ans):
                    print('whian::', ans[wh]['text'])
                    bdzd_a = ans[wh]['text']
                    wh += 1

            # Follow-up: ask again using the answer itself as the question.
            gqtojson(bzz[:1500], an['text'] + '_')
            an1, ans1 = run_mrc.abc()
            print(an1, '\n', ans1)
            bdzd_a = bdzd_a[:-2] + '...小萌插一句:' + an1['text'] + '##'
            print("bdzd_a+++", bdzd_a)

        if bdzd_a != '':  # an answer was produced from the crawled text
            a = correction(question, bdzd_a)  # corrected answer
            daily(question, bzz, bdzd_a, a, fromwhere)  # write the log entry
            a2 = time.time()
            print('baidu页面计算耗时', str(a2 - a1), a)
            return a
        else:
            # Nothing found. Log the crawled text (``bzz``) and pass the
            # same five arguments to ``daily`` as every other call site;
            # the previous code read an unassigned name here and passed
            # only four arguments.
            a = '算了半天还是没有答案'
            paqu = '瞅瞅爬到啥' + str(bzz)
            fromwhere = '哪儿都没找到'
            daily(question, paqu, a, a, fromwhere)  # write the log entry
            a2 = time.time()
            print('计算耗时', str(a2 - a1), a)
            return a
示例#3
0
def similar(question):
    """Find a stored question resembling ``question`` and answer from it.

    The question bank is the list literal stored in ``data/newlist.txt``.
    A stored question counts as similar when it equals the (newline
    stripped) input, or when the text around the 'Ж' placeholder of both
    questions overlaps by more than 66% of the stored question's length.
    A question not yet in the bank is appended and the file rewritten.

    When at least one similar question exists, the first one (after
    removing the input itself) is saved with ``gqtojson`` as the passage
    and ``run_mrc.abc`` is run on it.

    Args:
        question: question string, normally containing a 'Ж' placeholder.

    Returns:
        tuple: ``(three_a, three_par)`` — the ``'text'`` of the first MRC
        result and the stored question it was computed from. Both are
        ``''`` when nothing similar exists, the computation fails, or the
        answer is in ``mrc_p`` / still contains 'Ж'.
    """
    # NOTE(security): eval() executes whatever the file contains; this is
    # only acceptable while data/newlist.txt is written solely by this
    # function. Consider ast.literal_eval if that cannot be guaranteed.
    with open('data/newlist.txt', 'r', encoding='UTF-8') as bank_file:
        file_save = bank_file.read()
    file_save_list = eval(file_save.replace('\ufeff', ''))

    s_q = question.strip('\n')
    p0 = s_q.find('Ж')
    # Cheap pre-filter: the first three characters of the question, or its
    # last four when the placeholder sits inside the first three.
    cut = s_q[0:3]
    if cut.find('Ж') > -1:
        cut = s_q[-4:]

    same = []
    for qu in file_save_list:
        qu = qu.strip('\n')
        if qu.find(cut) == -1:
            continue

        if qu == s_q:
            same.append(qu)
            continue

        p1 = qu.find('Ž'.replace('Ž', 'Ж'))
        first_pos = min(p0, p1)
        last_pos = max(p0, p1)
        # ques[0] is the sentence whose 'Ж' comes earlier, ques[1] the other.
        if p0 > p1:
            ques = [qu, s_q]
        else:
            ques = [s_q, qu]

        head = ques[0][:first_pos]  # text before the earlier 'Ж'
        tail = ques[1][last_pos + 1:]  # text after the later 'Ж'
        p2 = ques[0].find(tail)
        if p2 == -1:
            continue
        middle = ques[0][first_pos + 1:p2]

        p3 = ques[1].find(head)
        if p3 == -1:
            continue
        p3 = p3 + len(head)

        part1_2 = ques[1][p3:last_pos - 1]

        # Shared head and tail count fully; each character of the middle
        # part counts when it also occurs in the other sentence's middle.
        cc_mat = len(head) + len(tail)
        cc_mat += sum(1 for ch in middle if part1_2.find(ch) > -1)
        if cc_mat > len(qu) * 0.66:
            same.append(qu)

    # Store the question in the bank when it is new.
    if question not in file_save_list:
        file_save_list = list(file_save_list)
        file_save_list.append(question)
        with open('data/newlist.txt', 'w+', encoding='UTF-8') as bank_file:
            bank_file.write(str(file_save_list))

    three_a = ''
    three_par = ''
    if same:
        try:
            if question in same:
                same.remove(question)
            papa = same[0]
            gqtojson(str(papa), question)  # save the passage/question pair
            mrc_result = run_mrc.abc()
            print('相似度计算结果==', mrc_result)
            # run_mrc.abc() returns a pair whose first item carries the
            # answer under 'text' (as used by the other callers in this
            # file); return that text so the string checks below work.
            three_a = mrc_result[0]['text']
            three_par = papa
        except Exception:
            # Best effort: any failure simply means "no similar answer".
            three_a = ''
            three_par = ''

    if three_a in mrc_p or 'Ж' in three_a:
        three_a = ''
        three_par = ''
    return three_a, three_par
def test_zyb(question):
    """Answer ``question`` via similarity lookup, crawled text and MRC.

    Order of attempts:
      1. ``similar`` — answer from a stored similar question.
      2. Questions mentioning 喻体/比喻句/本体 are answered by running the
         model on the part of the question before the first comma.
      3. Text from ``bd_ym`` and ``baike_js`` is cut into chunks; the
         model is run per chunk and the results re-scored by ``fix_prob``.
      4. As a last resort ``baidu_ym`` crawls a result page.

    Outcomes of steps 1, 3 and 4 are logged with ``daily``.

    Args:
        question: raw question string.

    Returns:
        The answer string; '算了半天还是没有答案' when every step fails.
    """
    print('question::::::::::', question)
    a1 = time.time()  # start of timing
    #time.sleep(0.3)  # optional delay between questions
    # 1. Clean the question
    question = question.strip('+').replace('+', '')
    question = clear_input(question)  # turn the interrogative part into 'Ж'
    print('清理过字符的问题==', question)

    # 2. Similarity lookup against the existing question bank
    similar_a, similar_par = similar(question)

    if similar_a != '':  # a similar stored question produced an answer

        aa = similar_a
        paqu = similar_par
        fromwhere = '从相似度计算得来'
        a = correction(question, aa)  # corrected answer
        daily(question, paqu, aa, a, fromwhere)  # write the log entry
        a2 = time.time()
        print('相似度计算计算耗时==', str(a2 - a1), a)
        return a
    elif re.search(r'喻体|比喻句|本体', question):
        # Metaphor question: the passage is the text before the first comma.
        gqsavetojson.gqtojson(re.split(r'[,,]', question)[0], question)
        an, an1 = run_mrc.abc()
        return an['text']
    else:

        # 3. Crawl and compute
        bdzd_aa, bdzd_para = bd_ym(question)
        bdzd_ab, bdzd_parb = baike_js(question)
        bzz = bdzd_para + bdzd_parb
        sp_bzz = re.split(r'[。;!]', bzz)
        print(len(sp_bzz), '个大句子   ', len(bzz), '字!')
        print(bzz)
        # Chunk size: a third of the text, but at least 350 characters.
        len_pa = max(len(bzz) / 3, 350)
        sp_b = ''
        ans = []
        biyu_talk = ''
        qas_texts = [
            '本文中的比喻句是:Ж', '@key,这句话中的喻体是Ж', '@key,这句话的本体是Ж',
            '@key,这个比喻中的类比属性是Ж'
        ]
        for sb in range(0, len(sp_bzz)):
            big_s = re.split(r'[。!\n]|\.\.\.', sp_bzz[sb])  # split the sentence
            if re_list != []:
                for big in big_s:
                    for r_b in re_list:
                        biyu = re.search(re.compile(r_b), big)
                        if biyu:
                            b_ana = biyu_ana(biyu, big)
                            if b_ana[0] == '':
                                continue
                            if biyu_talk.find(b_ana[0]) > -1:
                                print('break重复')
                                break
                            evals = ['', '', '', '']

                            # One model query per non-empty analysis slot.
                            for ii in range(0, 4):
                                b_ana[ii] = b_ana[ii].strip(' ')
                                if b_ana[ii] != '':
                                    qii = qas_texts[ii].replace(
                                        '@key', b_ana[0])
                                    gqsavetojson.gqtojson(b_ana[-1], qii)
                                    an, an1 = run_mrc.abc()
                                    evals[ii] = an['text']
                            if biyu_talk.find(evals[0]) > -1:
                                print('break2......')
                                break
                            biyu_talk += '本文中的比喻句有:Ж' + evals[0] + '\n'
                            if b_ana[1] != '' and b_ana[2] != '':
                                bytk = '把@k1比作@k2'.replace('@k1',
                                                           evals[2]).replace(
                                                               '@k2', evals[1])
                                if b_ana[3] != '':
                                    bytk += ',同样@k3'.replace('@k3', evals[3])
                                biyu_talk += bytk + '\n'
                            print('evals[2]::b_ana[2]', evals[2],
                                  len(b_ana[2]), b_ana[2])

            try:
                sp_b += sp_bzz[sb] + '。'
                # Run the model once the chunk is long enough, or at the
                # last sentence.
                if len(sp_b) > len_pa or sb == len(sp_bzz) - 1:
                    gqsavetojson.gqtojson(sp_b, question)
                    an, an1 = run_mrc.abc()
                    print('ananananana', an, an1)
                    an['text'] = an['text'].strip().replace('\'', '').replace(
                        '\r', '').replace('\n', '').replace('\"', '')
                    if an not in ans:
                        ans.append(an)
                        ans.append(an1)
                    sp_b = ''

            except Exception as e:
                # Best effort per chunk: report the failure and move on
                # (previously a bare ``except`` hid every error, including
                # KeyboardInterrupt).
                print(e)

        # Re-score each answer by how well its sentence matches the question.
        ans = fix_prob(ans, question,
                       sp_bzz)
        ans = sorted(
            ans,
            key=lambda x: (x["probability"]),  # most probable answer first
            reverse=True)
        print(len(ans), 'ans::++', ans)

        if len(ans) > 0:
            bdzd_a = ans[0]['text']
        else:
            bdzd_a = ''

        bdzd_par = bzz
        if bdzd_a in mrc_p:
            bdzd_a = ''

        if bdzd_a != '':
            aa = bdzd_a
            paqu = bdzd_par
            fromwhere = '从知道计算得来'
            a = correction(question, aa)  # corrected answer

            daily(question, paqu, aa, a, fromwhere)  # write the log entry

            a2 = time.time()
            print('知道计算耗时', str(a2 - a1), a)
            return '小呆的查询:' + a + '##'

        else:  # 4. no result so far: crawl the result page instead
            print('爬百度页面::', question)
            baidu_a, baidu_par = baidu_ym(question)

            if baidu_a in mrc_p:
                baidu_a = ''

            if baidu_a != '':  # the page was parsed and produced an answer
                aa = baidu_a
                paqu = baidu_par
                fromwhere = '从baidu页面得来'

                a = correction(question, aa)  # corrected answer
                daily(question, paqu, aa, a, fromwhere)  # write the log entry
                a2 = time.time()
                print('baidu页面计算耗时', str(a2 - a1), a)
                return a
            else:
                aa = '算了半天还是没有答案'
                a = '算了半天还是没有答案'
                paqu = '瞅瞅爬到啥' + str(baidu_par)
                fromwhere = '哪儿都没找到'
                daily(question, paqu, aa, a, fromwhere)  # write the log entry
                a2 = time.time()
                print('计算耗时', str(a2 - a1), a)
                return a
def riji_test(que='你喜欢唱歌跳舞吗', f1='概念.我.xml', no='我所知',
              p_xml='D:/YYY/'):  # marked as unused by the original author
    """Answer ``que`` from the text stored under one node of an XML file.

    The first ``no`` element found in ``p_xml + f1`` is taken; each child
    with text is cut into chunks of roughly 300 characters. A chunk is
    scored by the words of ``que`` (segmented with ``jieba``): +10 per
    word found in the child's topic, + word length per distinct word found
    in the child's text. The three best-scoring chunks are passed to the
    MRC model and the most probable answer text is returned.

    Args:
        que: question text.
        f1: XML file name inside ``p_xml``.
        no: tag name of the node whose children are searched.
        p_xml: directory prefix of the XML file (previously hard-coded).

    Returns:
        The ``'text'`` of the most probable answer, or ``''`` when the
        node does not exist or no chunk scores high enough.
    """
    tree0 = ET.parse(p_xml + f1)
    root = tree0.getroot()

    ws = list(jieba.cut(que))
    # First matching node; previously a missing node left the variable
    # unbound and the loop below raised NameError.
    myexp = next(root.iter(no), None)
    if myexp is None:
        return ''

    common_words = ['是', '有', '了', '在', '的', ',', '?']  # too frequent to score
    exp_my = []
    for ex in myexp:
        if not ex.text:
            continue
        topic = ex.tag
        if topic == '日记':
            topic += '+' + ex.attrib['标题']
        if topic == '象声词':  # skipped for now
            continue
        sp_text = re.split(r'[。;!\n]', ex.text)
        last = len(sp_text) - 1
        ssp = ''
        for sp, sentence in enumerate(sp_text):
            if len(sentence) >= 3:
                ssp += sentence + '。'
            # Flush at ~300 characters or at the final piece. The final
            # piece is usually empty (text ending in '。'); it used to be
            # skipped before this check, so the trailing chunk was lost.
            if not ssp or (len(ssp) < 300 and sp != last):
                continue
            sco = 0
            mw = []
            for w in ws:
                if w in common_words:
                    continue
                if topic.find(w) > -1:
                    sco += 10
                if ex.text.find(w) > -1:
                    if w not in mw:
                        mw.append(w)
                        sco += len(w)

            if sco > 3:
                exp_my.append((topic + '\n' + ssp, sco, mw))
            ssp = ''

    exp_my = sorted(
        exp_my,
        key=lambda x: (x[1]),  # best-scoring chunk first
        reverse=True)
    print('exp_my::', exp_my[:3])

    ans = []
    for ex in exp_my[:3]:
        gqsavetojson.gqtojson(ex[0], que)
        an, an1 = run_mrc.abc()
        an['text'] = an['text'].strip().replace('\'', '').replace(
            '\r', '').replace('\n', '').replace('\"', '')
        if an not in ans:
            ans.append(an)
            ans.append(an1)
    ans = sorted(
        ans,
        key=lambda x: (x["probability"]),  # most probable answer first
        reverse=True)
    print('ans::', len(ans), ans)
    # No scored chunk means no answer; previously this raised IndexError.
    if not ans:
        return ''
    return ans[0]['text']