Example #1
# `ct` is this project's file/text helper module (imported elsewhere).
import jieba


def main():
    # fnam = "../data/word2vec/zh-cn/wiki_texts_seg.txt.bin"
    # model = models.Word2Vec.load(fnam)
    jieba.set_dictionary('jieba_dict/dict.txt.big')  # use the big dictionary

    # Load the stopword set (built here but not applied below;
    # see the sketch after this example).
    stopwordset = set()
    with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))

    word_set = set()  # unused in this example

    # Path to the questions file.
    # path1 = "../data/nlpcc2016/nlpcc-iccpol-2016.kbqa.training.testing-data-all.txt"
    path1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v2.txt'
    # Path to the RDF file.
    lines = ct.file_read_all_lines(path1)
    # lines = ['《机械设计基础》这本书的作者是谁', '鑫威kw9000es是个什么产品']
    result_lines = []
    for line in lines:
        sentence = str(line).split('\t')[0]
        # line = "《机械设计基础》这本书的作者是谁"
        sentence = ct.clean_str_question(sentence)
        # Segment the question with jieba (precise mode).
        words = jieba.cut(sentence, cut_all=False)
        result_lines.append(' '.join(words))
    ct.file_wirte_list('../data/nlpcc2016/4-ner/seg/sentence.v6.txt',
                       result_lines)
    print('done')
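The stopword set loaded above is never applied in this example. A minimal sketch of filtering the segmented tokens with it before joining; this is an assumption about the intended use, not part of the original pipeline:

    for line in lines:
        sentence = ct.clean_str_question(str(line).split('\t')[0])
        # Drop stopwords from the jieba tokens before joining.
        words = [w for w in jieba.cut(sentence, cut_all=False)
                 if w not in stopwordset]
        result_lines.append(' '.join(words))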
Example #2
def re_write(f1, f2):
    """Convert the question file format."""
    f1s = ct.file_read_all_lines_strip(f1)
    f2s = []
    for l1 in f1s:
        if 'question id' in l1:  # keep only the question lines
            f2s.append(l1.split('\t')[1].replace(' ', '').lower())
    ct.file_wirte_list(f2, f2s)
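A usage sketch: the input is apparently the NLPCC 2016 KBQA dump (its question lines are tab-separated and tagged with "question id", which is what the check above keys on); the output path is hypothetical:

# Output path is hypothetical; the input path also appears in Example #1.
re_write('../data/nlpcc2016/nlpcc-iccpol-2016.kbqa.training.testing-data-all.txt',
         '../data/nlpcc2016/3-questions/questions.txt')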
Example #3
def re_write_m2id(f1, f_out):
    f1s = ct.file_read_all_lines_strip(f1)  # read all the questions
    f2 = []
    for l1 in f1s:
        l1 = str(l1).replace(' ', '').replace('|||', '\t')
        l1 = ct.clean_str_s(l1)
        f2.append(l1)
    ct.file_wirte_list(f_out, f2)
Example #4
def prepare_data():
    f1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v1.txt'
    f3 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.txt'
    f4 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.sort_by_ner_lstm.txt'
    f1s = ct.file_read_all_lines_strip(f1)
    f3s = ct.file_read_all_lines_strip(f3)
    f1s_new = []
    f3s_new = []
    for i in range(len(f1s)):
        if 'NULL' in f1s[i]:  # filter out NULL entries
            continue
        f1s_new.append(f1s[i])
        f3s_new.append(f3s[i])

    # NULL entries filtered above; now take each candidate entity,
    # substitute it into the question, and judge the result.

    # cs.append('立建候时么什是♠')
    # Read all candidate entities, score them, and take the top 3 to check accuracy.

    f4s = []
    _index = -1
    for l1 in f1s_new:  # iterate over each question
        _index += 1
        replace_qs = []
        for l3 in f3s_new[_index].split('\t'):
            # Mask the candidate entity with ♠.
            q_1 = str(l1).split('\t')[0].replace(l3, '♠')
            replace_qs.append((q_1, l3))
        entitys = []
        for content, l3 in replace_qs:
            # content = input("input:")
            r1 = '1'  # placeholder score; Example #10 computes it with the model
            entitys.append((l3, r1))
            # print(content)
            # print(r1)
            # print(score_list)
        entitys.sort(key=lambda x: x[1])  # sort candidates by score
        entitys_new = [x[0] for x in entitys]

        f4s.append('\t'.join(entitys_new))
    ct.file_wirte_list(f4, f4s)
Example #5
    def ner_re_writer(f1='../data/nlpcc2016/ner_t1/q.rdf.m_s.filter.txt',
                      f2='../data/nlpcc2016/class/q.rdf.m_s.filter.re_writer.txt'):
        """
        Rewrite the question bank.
        """
        # 1. Read the question bank.
        # 2. Mask the entity in each question and write the result out.

        f1s = ct.file_read_all_lines_strip(f1)
        f1s_new = []
        for f1s_l in f1s:
            s1 = str(f1s_l).split('\t')
            e1 = s1[5]  # the gold entity
            q1 = s1[0].replace(' ', '').lower()
            q2 = q1.replace(e1, '♠')  # mask the entity in the question
            s1.append(q2)
            f1s_new.append('\t'.join(s1))
        ct.file_wirte_list(f2, f1s_new)

        print('done')
Example #6
    def class1(f5='../data/nlpcc2016/5-class/class1.txt',
               f1="../data/nlpcc2016/2-kb/kb-use.v2.txt"):
        bkh = baike_helper()
        bkh.init_spo(f_in=f1)
        keys = bkh.kbqa.keys()
        ps_dict = dict()  # key: '\t'.join(predicate group), value: occurrence count
        for key in keys:
            vs = list(bkh.kbqa.get(key))
            vs1 = [x for x in vs[1]]
            if len(ct.clean_str_answer(vs1)) == len(set(ct.clean_str_answer(vs1))):
                # Skip entities whose answer values contain no duplicates.
                continue
            _vs_dict = dict()
            # For this entity's (predicate, value) pairs: whenever a value
            # repeats, collect the predicates that share it.
            for _vs in vs:
                if _vs[1] in _vs_dict:
                    _vs_dict[_vs[1]].add(_vs[0])
                else:
                    _vs_dict[_vs[1]] = {_vs[0]}
            # For each value -> {predicate1, predicate2, ...} entry, use the
            # predicate group as a key and count its occurrences globally.
            for (k1, v1) in _vs_dict.items():
                if len(v1) <= 1:
                    continue
                key1 = '\t'.join(list(v1))

                if key1 in ps_dict:
                    ps_dict[key1] += 1
                else:
                    ps_dict[key1] = 1

        tp = ct.sort_dict(ps_dict, True)
        f5s = []
        for t in tp:
            f5s.append("%s\t%s" % (t[0], t[1]))
        ct.file_wirte_list(f5, f5s)

        # Merge predicate groups that share any word into "word bags"
        # (see the union-find sketch after this example).
        keys = ps_dict.keys()
        words_bag_list = []
        for key in keys:

            words = set(str(key).split('\t'))

            # Look for an existing bag that shares a word with this group.
            exist = False
            wl_index = -1
            for word in words:
                for wl_index in range(len(words_bag_list)):
                    if word in words_bag_list[wl_index]:
                        exist = True
                        break
                if exist:
                    break
            if exist:
                # Fold the whole group into the matching bag.
                words_bag_list[wl_index].update(words)
            else:
                words_bag_list.append(set(words))
        # Write out words_bag_list.
        f5s = []
        for words_bag in words_bag_list:
            f5s.append('\t'.join(list(words_bag)))
        ct.file_wirte_list(f5 + '.combine.txt', f5s)
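A note on the merging loop above (repeated in Example #7): it only folds the new group into the first matching bag, so two existing bags that each overlap the group are never merged with each other. A minimal union-find sketch of the fully transitive grouping; the function name is illustrative only, and the sample predicates come from the comment in Example #7:

def merge_word_groups(groups):
    """Union-find grouping: any two groups sharing a word end up in one bag."""
    parent = {}

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb

    for group in groups:
        words = list(group)
        for w in words:
            parent.setdefault(w, w)
        for w in words[1:]:
            union(words[0], w)

    bags = {}
    for w in parent:
        bags.setdefault(find(w), set()).add(w)
    return list(bags.values())


# The two groups sharing '面积' collapse into a single bag.
print(merge_word_groups([{'规划总面积', '面积'}, {'建筑面积', '面积'}, {'显示器尺寸'}]))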
Example #7
    def class2(f5='../data/nlpcc2016/5-class/class2.txt',
               f1="../data/nlpcc2016/3-questions/q.rdf.ms.re.v1.filter.txt"):
        f1s = ct.file_read_all_lines_strip(f1)
        # Column 6 is the masked question pattern (written by ner_re_writer).
        f1s_new = [str(x).split('\t')[6] for x in f1s]

        q_patten_set = set()
        q_patten_dict = dict()
        q_count_dict = dict()
        for f1_line in f1s_new:
            q_patten_set.add(f1_line)  # deduplicate the patterns

        gc1 = ct.generate_counter()
        for q1 in q_patten_set:  # iterate over the unique patterns
            for f1_line in f1s:  # iterate over all question lines
                index = gc1()
                if index % 100000 == 0:
                    print("%d - %d" % (index // 100000,
                                       len(q_patten_set) * len(f1s) // 100000))
                _q1 = str(f1_line).split('\t')[6]  # masked question
                _ps = str(f1_line).split('\t')[3]  # predicate
                q1 = str(q1)
                if _q1 != '♠' and q1 in _q1:  # equal, or contained in the pattern
                    if q1 in q_patten_dict:
                        q_patten_dict[q1].add(_ps)
                        q_count_dict[q1] += 1
                    else:
                        q_patten_dict[q1] = {_ps}
                        q_count_dict[q1] = 1

        tp = ct.sort_dict(q_count_dict)
        f5s = []
        for t in tp:
            f5s.append("%s\t%s\t%s" % (t[0], t[1], '\t'.join(list(q_patten_dict[t[0]]))))
        ct.file_wirte_list(f5, f5s)

        #  -------

        # Same word-bag merging as in class1 (Example #6), over the predicate
        # sets collected per question pattern.
        keys = q_patten_dict.keys()
        words_bag_list = []
        for key in keys:

            # words = set(str(key).split('\t'))
            # e.g. words = {'规划总面积', '建筑面积', '显示器尺寸', '面积', '占地总面积'}
            words = q_patten_dict.get(key)

            exist = False
            wl_index = -1
            for word in words:
                for wl_index in range(len(words_bag_list)):
                    if word in words_bag_list[wl_index]:
                        exist = True
                        break
                if exist:
                    break
            if exist:
                # Fold the whole group into the matching bag.
                words_bag_list[wl_index].update(words)
            else:
                words_bag_list.append(set(words))
        # Write out words_bag_list.
        f5s = []
        for words_bag in words_bag_list:
            f5s.append('\t'.join(list(words_bag)))
        ct.file_wirte_list(f5 + '.combine.txt', f5s)
Example #8
        list1_new = [
            baike_helper.entity_re_extract_one_repeat(ct.clean_str_zh2en(x))
            for x in res2
        ]
        # Deduplicate.
        list1_new = ct.list_no_repeat(list1_new)
        # 5.8.3: also drop candidates contained in another candidate,
        # e.g. 有一首歌叫 / 有一首歌 / 一首歌.
        if True:  # toggle; gives a slight improvement
            list1_new_2 = []
            for list1_new_word in list1_new:
                if not ct.be_contains(list1_new_word, list1_new):
                    list1_new_2.append(list1_new_word)
            list1_new = list1_new_2
        ct.print(list1_new)

    # Extract entities via jieba segmentation.
    if False:
        # Read the alias dictionary and write it out as "name length n"
        # (jieba dictionary format: word, weight, POS tag; the name length
        # is used as the weight here).
        names = ct.file_read_all_lines_strip_no_tips(
            config.cc_par('alias_dict'))
        alias = [str(x).split('\t')[0] for x in names]
        # alias = list(sorted(alias, key=lambda x: len(x), reverse=True))
        alias2 = ["%s %d n" % (ct.clean_str_s(str(x)), len(x)) for x in alias]
        path1 = '../word2vec-test/jieba_dict/dict.txt.big'
        path2 = '../data/nlpcc2016/4-ner/extract_e/dict.txt.big'  # unused here
        ct.file_wirte_list(path1, alias2)
        print('done')
    print('done')
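The dictionary file written above is the one Example #1 loads. A minimal sketch of consuming it; jieba.set_dictionary and jieba.load_userdict are standard jieba APIs, and the choice between replacing and merging is an assumption about intent:

import jieba

# Replace the main dictionary, as Example #1 does:
jieba.set_dictionary('../word2vec-test/jieba_dict/dict.txt.big')
# Or merge into the default dictionary instead (assumed alternative):
# jieba.load_userdict('../word2vec-test/jieba_dict/dict.txt.big')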
Example #9
    if len(str(f2s[i]).split('\t')) < 4:
        continue
    p1 = str(f1s[i]).split('\t')[2].lower()
    p2 = str(f2s[i]).split('\t')[2].lower()
    if p1.replace(' ', '') != p2.replace(' ', ''):
        # The two predicates differ: check which one shares more words
        # with the question sentence.
        line = str(f1s[i]).split('\t')[0]

        count1 = math1(p1)
        count2 = math1(p2)

        l1_append = ''
        l2_append = ''

        if count1 > count2:
            l1_append = '\t@@@@'  # mark the winning side
        elif count1 < count2:
            l2_append = '\t@@@@'
        else:
            print('==')  # tie

        l1.append(f1s[i] + l1_append)
        l2.append(f2s[i] + l2_append)

ct.file_wirte_list('../data/nlpcc2016/ner_t1/q.rdf.compare-1.txt', l1)
ct.file_wirte_list('../data/nlpcc2016/ner_t1/q.rdf.compare-2.txt', l2)
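math1 is defined elsewhere in the project; judging from the comment above it scores a predicate by its overlap with the question, and since it is called with only the predicate it presumably reads line from an enclosing scope. A hypothetical character-overlap sketch, not the project's actual implementation:

def math1(predicate, sentence):
    # Hypothetical: count how many characters of the predicate occur in the
    # sentence; the real math1 may work at word level and close over `line`.
    return sum(1 for ch in predicate if ch in sentence)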
Example #10
def main(_):
    # prepare_data()
    # FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    # converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model = 'ner'
    dh = data_helper.DataClass(model)
    train_batch_size = 1
    # g = dh.batch_iter_char_rnn(train_batch_size)  # (FLAGS.num_seqs, FLAGS.num_steps)
    embedding_weight = dh.embeddings

    model = CharRNN(dh.converter.vocab_size,  # vocabulary size; candidates come from it
                    num_seqs=train_batch_size,  # FLAGS.num_seqs: sentences per batch
                    num_steps=dh.max_document_length,  # FLAGS.num_steps: sentence length
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size,
                    embedding_weight=embedding_weight,
                    sampling=True,
                    dh=dh)

    model.load(FLAGS.checkpoint_path)
    # cs = []
    # cs.append('♠是什么类型的产品')
    # cs.append('♠是谁')
    # cs.append('♠是哪个公司的长度')
    f1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v1.txt'
    f3 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.v1.txt'
    f4 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.sort_by_ner_lstm.v1.txt'
    f1s = ct.file_read_all_lines_strip(f1)
    f3s = ct.file_read_all_lines_strip(f3)
    f1s_new = []
    f3s_new = []
    for i in range(len(f1s)):
        # if 'NULL' in f1s[i]:  # NULL filtering disabled in this version
        #     continue
        f1s_new.append(f1s[i])
        f3s_new.append(f3s[i])

    # Take each candidate entity, substitute it into the question, and judge it.

    # cs.append('立建候时么什是♠')
    # Read all candidate entities, score them, and take the top 3 to check
    # accuracy (see the sketch after this example).

    f4s = []
    _index = -1
    for l1 in f1s_new:  # iterate over each question
        _index += 1
        replace_qs = []
        for l3 in f3s_new[_index].split('\t'):
            # Mask the candidate entity with ♠.
            q_1 = str(l1).split('\t')[0].replace(l3, '♠')
            replace_qs.append((q_1, l3))
        entitys = []
        for content, l3 in replace_qs:
            # content = input("input:")
            start = dh.convert_str_to_indexlist_2(content, False)

            # arr = model.sample(FLAGS.max_length, start, dh.converter.vocab_size,
            #                    dh.get_padding_num())  # converter.vocab_size
            r1, score_list = model.judge(start, dh.converter.vocab_size)
            entitys.append((l3, r1))
            # print(content)
            # print(r1)
            # print(score_list)
            ct.print("%s\t%s\t%s" % (content, l3, r1), 'debug_process')
        entitys.sort(key=lambda x: x[1])  # sort candidates by judge score
        entitys_new = [x[0] for x in entitys]
        ct.print('\t'.join(entitys_new))
        f4s.append('\t'.join(entitys_new))
    ct.file_wirte_list(f4, f4s)
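The earlier comment calls for taking the top 3 scored candidates to check accuracy. A minimal sketch, assuming a lower judge score means a better candidate (consistent with the ascending sort above); if higher is better, add reverse=True:

        # Hypothetical top-3 selection per question (ascending = best first).
        top3 = [x[0] for x in sorted(entitys, key=lambda x: x[1])[:3]]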