Example #1
def test():
    from word_sequence import WordSequence
    str1 = '1998年/,/经/过/统/一/部/署/,/伊/犁/州/,/地/两/级/党/委/开/始/尝/试/以/宣/讲/团/的/形/式/,/深/入/学/校/,/村/民/院/落/,/田/间/地/头/,/向/各/族/群/众/进/行/面/对/面/宣/讲/.'
    words = str1.split('/')
    print(words)
    ws = WordSequence()
    ws.fit(words)
    print(ws.dict)
Example #2
def test_transform_sentence():
    # transform_sentence is assumed to come from the same word_sequence module
    from word_sequence import WordSequence, transform_sentence

    ws = WordSequence()
    ws.fit([
        ['你', '好', '啊'],
        ['你', '好', '哦']
    ])
    indice = transform_sentence(['我', '们', '好'], max_len=4, add_end=False, ws=ws)
    print(indice)

    back = ws.inverse_transform(indice[0], ignore_pad=True, ignore_unk=False, ignore_start=True, ignore_end=True)
    print(back)
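Examples #1 and #2 use a project-local word_sequence module that is not shown on this page. Below is a minimal sketch of a WordSequence-like class and a transform_sentence helper, reconstructed only from how the examples call them; the project's real implementation (including the variant with a separate build_vocab step used in Example #12) may well differ:

# A minimal sketch, assuming the call patterns seen in Examples #1 and #2:
# fit() builds self.dict, transform_sentence() returns (indices, length),
# inverse_transform() can skip the reserved tokens.
class WordSequence(object):
    PAD_TAG, UNK_TAG, START_TAG, END_TAG = '<pad>', '<unk>', '<s>', '</s>'
    PAD, UNK, START, END = 0, 1, 2, 3

    def __init__(self):
        # the four reserved tokens occupy the first indices
        self.dict = {self.PAD_TAG: self.PAD, self.UNK_TAG: self.UNK,
                     self.START_TAG: self.START, self.END_TAG: self.END}

    def fit(self, sentences, min_count=1, max_features=None):
        # count word frequencies, then give an index to each frequent word
        counts = {}
        for sentence in sentences:
            for word in sentence:
                counts[word] = counts.get(word, 0) + 1
        words = [w for w, c in counts.items() if c >= min_count]
        if max_features is not None:
            words = sorted(words, key=lambda w: -counts[w])[:max_features]
        for word in words:
            self.dict[word] = len(self.dict)

    def transform(self, sentence):
        return [self.dict.get(w, self.UNK) for w in sentence]

    def inverse_transform(self, indices, ignore_pad=False, ignore_unk=False,
                          ignore_start=False, ignore_end=False):
        ignored = {self.PAD: ignore_pad, self.UNK: ignore_unk,
                   self.START: ignore_start, self.END: ignore_end}
        reverse = {v: k for k, v in self.dict.items()}
        return [reverse[i] for i in indices if not ignored.get(i, False)]

    def __len__(self):
        return len(self.dict)


def transform_sentence(sentence, max_len=None, add_end=False, ws=None):
    # encode one sentence, optionally appending END and padding to max_len;
    # returns (indices, length) so that `indice[0]` above is the index list
    indices = ws.transform(sentence)
    if add_end:
        indices.append(ws.END)
    length = len(indices)
    if max_len is not None:
        indices = indices[:max_len] + [ws.PAD] * (max_len - len(indices))
    return indices, length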
Example #3
def generate(max_len=10, size=1000, same_len=False, seed=0):
    '''
    Generate fake data.
    :param max_len: maximum sequence length
    :param size: number of sequences
    :param same_len: whether x and y sequences must be the same length
    :param seed: random seed
    :return: x_data, y_data, ws_input, ws_target
    '''

    dictionary = {
        'a': '1',
        'b': '2',
        'c': '3',
        'd': '4',
        'aa': '1',
        'bb': '2',
        'cc': '3',
        'dd': '4',
        'aaa': '1'
    }

    if seed is not None:
        random.seed(seed)

    input_list = sorted(list(dictionary.keys()))
    print('input_list: ', input_list)

    x_data = []
    y_data = []

    for _ in range(size):
        a_len = int(random.random() * max_len) + 1
        x = []
        y = []
        # Each iteration builds one question-answer pair and appends it to
        # x_data/y_data. A question and its answer often differ in length;
        # that is exactly what the `if not same_len` branch below produces
        # (a standalone demonstration follows this function).
        for _ in range(a_len):
            word = input_list[int(random.random() * len(input_list))]
            x.append(word)
            y.append(dictionary[word])
            if not same_len:
                if y[-1] == '2':
                    y.append('2')
                elif y[-1] == '3':
                    y.append('3')
                    y.append('4')

        x_data.append(x)
        y_data.append(y)
    print('x_data', x_data)
    print('y_data', y_data)

    ws_input = WordSequence()
    ws_input.fit(x_data)

    ws_target = WordSequence()
    ws_target.fit(y_data)
    return x_data, y_data, ws_input, ws_target
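The comment in the inner loop explains why a question and its answer can differ in length when same_len is False: every '2' gains an extra '2' and every '3' gains a trailing '3' and '4'. Here is the rule in isolation, as a standalone snippet reusing the same mapping:

# Standalone demonstration of the label-expansion rule used above.
dictionary = {'a': '1', 'b': '2', 'c': '3', 'd': '4'}
x = ['a', 'b', 'c']
y = []
for word in x:
    y.append(dictionary[word])
    if y[-1] == '2':
        y.append('2')
    elif y[-1] == '3':
        y.append('3')
        y.append('4')
print(x)  # ['a', 'b', 'c']
print(y)  # ['1', '2', '2', '3', '3', '4'] -- longer than x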
Example #4
def generate(max_len=10, size=1000, same_len=False, seed=0):
    """
    Generate data for the test routine.
    :param max_len: maximum sequence length
    :param size: number of sequences
    :param same_len: whether each seq in x_data and the matching seq in y_data have the same length
    :param seed: random seed
    :return: x_data, y_data: lists of fake question-answer sequences
             ws_input, ws_target: WordSequence objects whose vocabulary is stored in self.dict
    """
    dictionary = {
        'a': '1',
        'b': '2',
        'c': '3',
        'd': '4',
        'aa': '1',
        'bb': '2',
        'cc': '3',
        'dd': '4',
        'aaa': '1',
    }

    if seed is not None:
        random.seed(seed)

    # input vocabulary
    input_list = sorted(list(dictionary.keys()))

    x_data = []  # sequences
    y_data = []  # labels

    # generate the fake question-answer pairs
    for i in range(size):
        a_len = int(random.random() * max_len) + 1
        x = []
        y = []
        for _ in range(a_len):
            word = input_list[int(random.random() * len(input_list))]
            x.append(word)
            y.append(dictionary[word])
            if not same_len:
                if y[-1] == '2':
                    y.append('2')
                elif y[-1] == '3':
                    y.append('3')
                    y.append('4')
        x_data.append(x)
        y_data.append(y)

    ws_input = WordSequence()
    ws_input.fit(x_data)

    ws_target = WordSequence()
    ws_target.fit(y_data)
    return x_data, y_data, ws_input, ws_target
Example #5
def generate_Chinese_ws(Chinese_corpors_path):
    '''
    Read the contents of chineses_corpors.txt and build the WordSequence for the Chinese text.
    :param Chinese_corpors_path: path to the Chinese corpus file
    :return: None (the fitted WordSequence is pickled to disk)
    '''
    with codecs.open(Chinese_corpors_path, 'r', 'utf-8') as f:
        lines = f.readlines()
        bar = tqdm.tqdm(lines)
        words = []
        for line in bar:
            line = line.strip()
            words.extend(line.split(' '))

        ws = WordSequence()
        ws.fit(words, min_count=2)
        # NOTE: the original referenced an undefined `corpors_path`; a pkl
        # output path derived from the input path is assumed here
        pickle.dump(ws, open(Chinese_corpors_path + '.ws.pkl', 'wb'))
        print(len(ws.dict))
Example #6
def generate(max_len=10, size=1000, same_len=False, seed=0):
    """
    生成虚假数据
    :param max_len:
    :param size:
    :param same_len:
    :param seed: 随机种子
    :return:
    """

    dictionary = {
        'a': '1',
        'b': '2',
        'c': '3',
        'd': '4',
        'aa': '5',
        'bb': '6',
        'cc': '7',
        'dd': '8',
        'aaa': '9',
    }

    if seed is not None:
        random.seed(seed)

    input_list = sorted(list(dictionary.keys()))

    x_data = []
    y_data = []

    for _ in range(size):
        a_len = int(random.random() * max_len) + 1
        x = []
        y = []
        for _ in range(a_len):
            word = input_list[int(random.random() * len(input_list))]
            x.append(word)
            y.append(dictionary[word])
            if not same_len:
                if y[-1] == '2':
                    y.append('2')
                elif y[-1] == '3':
                    y.append('3')
                    y.append('4')
        x_data.append(x)
        y_data.append(y)

    ws_input = WordSequence()
    ws_input.fit(x_data)

    ws_target = WordSequence()
    ws_target.fit(y_data)
    return x_data, y_data, ws_input, ws_target
Example #7
def main(limit=100):
    """执行程序
    Args:
        limit: 只输出句子长度小于limit的句子
    """
    from word_sequence import WordSequence

    x_data, y_data = [], []

    x, y = read_txt('train.txt')
    x_data += x
    y_data += y

    x, y = read_txt('validation.txt')
    x_data += x
    y_data += y

    x, y = read_txt('test.txt')
    x_data += x
    y_data += y

    print(len(x_data))

    print(x_data[:10])
    print(y_data[:10])

    print('tokenize')

    data = list(zip(x_data, y_data))
    data = [(x, y) for x, y in data if len(x) < limit and len(y) < limit]
    x_data, y_data = zip(*data)

    print(x_data[:10])
    print(y_data[:10])

    print(len(x_data), len(y_data))

    print('fit word_sequence')

    ws_input = WordSequence()
    ws_target = WordSequence()
    ws_input.fit(x_data, min_count=1)
    ws_target.fit(y_data, min_count=1)

    print('dump')

    pickle.dump((x_data, y_data, ws_input, ws_target), open('ner.pkl', 'wb'))

    print('done')
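read_txt is not defined in this example. A possible sketch, assuming a CoNLL-style file with one token/label pair per line and a blank line between sentences; the actual format of train.txt is not shown, so treat this purely as an illustration:

# Hypothetical read_txt for Example #7, assuming "token<TAB>label" lines
# with blank lines separating sentences.
def read_txt(path):
    x_data, y_data = [], []
    tokens, labels = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # a blank line ends the current sentence
                if tokens:
                    x_data.append(tokens)
                    y_data.append(labels)
                    tokens, labels = [], []
                continue
            token, label = line.split()[:2]
            tokens.append(token)
            labels.append(label)
    if tokens:  # the file may not end with a blank line
        x_data.append(tokens)
        y_data.append(labels)
    return x_data, y_data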
Example #8
def generate_english_ws(corpors_en_path='./datas/english.txt'):
    '''
    Build the WordSequence for the English corpus.
    :param corpors_en_path: path to the English corpus file
    :return: None (the fitted WordSequence is pickled to disk)
    '''
    with codecs.open(corpors_en_path, 'r', 'utf-8') as f:
        lines = f.readlines()
        bar = tqdm.tqdm(lines)
        total_words = []
        for line in bar:
            line = line.strip()
            words = line.split(' ')
            total_words.extend(words)

        print('done')
        ws = WordSequence()
        ws.fit(total_words, min_count=2)
        print('number of words:', len(total_words))
        print('number of corpus lines:', len(lines))
        print('size of the WordSequence vocabulary:', len(ws.dict))
        with open('datas/corpors_en.pkl', 'wb') as g:
            pickle.dump(ws, g)
        print('generate done')
Example #9
def main(limit=20, x_limit=3, y_limit=6):
    from word_sequence import WordSequence

    print('extract lines')
    fp = open("dgk_shooter_min.conv", 'r', errors='ignore', encoding='utf-8')
    groups = []
    group = []

    # tqdm shows a progress bar for long loops
    for line in tqdm(fp):
        # consecutive lines starting with 'M ' belong to one dialogue and go into one group
        if line.startswith('M '):
            line = line.replace('\n', '')

            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]
            # clean the sentence and split it into a list of characters
            group.append(list(regular(''.join(line))))
        else:
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract group')

    # Build the question-answer pairs. For a dialogue of four sentences the
    # pairs are 1->2, 2->3, 3->4, 12->3, 23->4, 1->23 and 2->34; when a
    # question or an answer is built from two sentences they are joined with
    # a separator.

    # Alternatively, the index combinations could be generated first and then
    # processed, e.g.:
    #     double_list = []
    #     triple_list = []
    #     n = 5
    #     for i in range(n):
    #         if i <= n - 2:
    #             double_list.append([i, i + 1])
    #         if i <= n - 3:
    #             triple_list.append([i, i + 1, i + 2])

    x_data = []
    y_data = []
    for group in tqdm(groups):
        for i, line in enumerate(group):
            # last_line is the sentence before the current one
            last_line = None
            if i > 0:
                last_line = group[i - 1]
                # if last_line is not a clean sentence, discard it
                if not good_line(last_line):
                    last_line = None

            # if the current sentence is not the last in the group, next_line is the following sentence
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None

            # if the current sentence is at least two from the end, next_next_line is the sentence after next
            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            # if next_line exists, add the pair (line, next_line)
            if next_line:
                x_data.append(line)
                y_data.append(next_line)

            # if both last_line and next_line exist, add (last_line + line, next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)

            # if both next_line and next_next_line exist, add (line, next_line + next_next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) +
                              next_next_line)

    print(len(x_data), len(y_data))

    # show the first 20 question-answer pairs
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)
    # each entry in data holds two lists: the question and the answer
    data = list(zip(x_data, y_data))

    # filter: drop a pair if either sentence reaches the maximum length or falls below its minimum length
    data = [
        (x, y)
        for x, y in data
        if len(x) < limit \
        and len(y) < limit \
        and len(y) >= y_limit \
        and len(x) >= x_limit
    ]
    # split back into x and y
    x_data, y_data = zip(*data)

    print('fit word_sequence')

    # fit the vocabulary
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)

    print('dump')

    pickle.dump((x_data, y_data), open('chatbot.pkl', 'wb'))
    pickle.dump(ws_input, open('ws.pkl', 'wb'))

    print('done')
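regular, good_line and make_split are used throughout these chatbot scripts but never defined on this page. The comments only say that regular normalizes a sentence, good_line rejects noisy lines, and make_split returns the separator used when two sentences are merged into one sequence, so the bodies below are merely plausible stand-ins consistent with those descriptions:

import re

def regular(sentence):
    # hypothetical normalization: collapse runs of repeated punctuation
    sentence = re.sub(r'\.{3,}', '…', sentence)
    sentence = re.sub(r',{2,}', ',', sentence)
    return sentence

def good_line(line):
    # hypothetical filter: reject lines with too many latin letters or digits
    return len(re.findall(r'[a-zA-Z0-9]', ''.join(line))) <= 2

def make_split(line):
    # hypothetical separator: add a comma unless the sentence already ends with punctuation
    if line and line[-1] in (',', '。', '!', '?'):
        return []
    return [',']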
Example #10
def main(limit=35,  # maximum sentence length
         x_limit=1,  # minimum question length
         y_limit=1):  # minimum answer length
    from word_sequence import WordSequence

    print('extract lines')
    """dgk语料"""
    # fp = open("raw_data/dgk_shooter_min.conv", 'r', errors='ignore', encoding='utf-8')
    """xiaohuangji语料"""
    fp = open("raw_data/richor2.conv", 'r', errors='ignore', encoding='utf-8')

    # all dialogue groups
    groups = []
    # lines of the current dialogue group
    group = []

    for line in tqdm(fp):  # show a progress bar

        if line.startswith('M '):  # sentence lines start with 'M '
            line = line.replace('\n', '')  # strip the newline

            if '/' in line:
                line = line[2:].split('/')  # split on the slashes -> list
                line = list(regular(''.join(line)))  # normalize the sentence

                line = jieba.lcut(''.join(line))  # tokenize with jieba
            else:
                line = list(line[2:])

            group.append(line)

        else:  # lines starting with 'E ' end the current dialogue
            if group:
                groups.append(group)
                group = []
    print(groups)
    if group:
        groups.append(group)
        group = []

    print('\nextract group')

    """定义问答对"""
    x_data = []
    y_data = []

    for group in tqdm(groups):
        for index, line in enumerate(group):
            last_line = None
            if index > 0:
                last_line = group[index - 1]
                # discard unusable sentences
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if index < len(group) - 1:
                next_line = group[index + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None
            if index < len(group) - 2:
                next_next_line = group[index + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) + next_next_line)

    # number of question-answer pairs
    print('\nquestions: ' + str(len(x_data)), 'answers: ' + str(len(y_data)))

    # preview the first 30 question-answer pairs
    for ask, answer in zip(x_data[:30], y_data[:30]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    """组装数据"""
    data = list(zip(x_data, y_data))

    # filtering rule:
    data = [
        (x, y) for x, y in data
        if len(x) < limit and len(y) < limit and len(y) >= y_limit and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)

    # fit the word_sequence vocabulary
    print('fit word_sequence')
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)

    # save in pickle format
    print('dump')
    pickle.dump(
        (x_data, y_data),
        # open('data/dgk_chatbot.pkl', 'wb')
        open('chatbot_2/richor_chatbot.pkl', 'wb')
    )
    pickle.dump(ws_input, open('chatbot_2/richor_ws.pkl', 'wb'))

    print('done')
Example #11
def main(limit=20, x_limit=3, y_limit=6):
    from word_sequence import WordSequence

    print('extract lines')
    fp = open("dgk_shooter_min.conv", 'r', errors='ignore', encoding='utf-8')
    groups = []
    group = []

    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')

            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])

            line = line[:-1]
            group.append(regular(''.join(line)))

        else:
            if group:
                groups.append(group)
                group = []

    if group:
        groups.append(group)
        group = []

    print('extract group')
    x_data = []
    y_data = []
    for group in tqdm(groups):
        for index, data in enumerate(group):  # index is the position, data the sentence within the group
            last_line = None
            if index > 0:
                last_line = group[index - 1]
                if not good_line(last_line):
                    last_line = None
            # fetch the previous line

            next_line = None
            if index < len(group) - 1:
                next_line = group[index + 1]
                if not good_line(next_line):
                    next_line = None
            # fetch the next line

            # fetch the line after next
            next_next_line = None
            if index < len(group) - 2:
                next_next_line = group[index + 2]

                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                x_data.append(group[index])
                y_data.append(next_line)
                # the current line is the question, the next line the answer

                if last_line:
                    x_data.append(last_line + make_split(last_line) +
                                  group[index])
                    y_data.append(next_line)
                # with a previous line: previous + current form the question, the next line the answer

                if next_next_line:
                    x_data.append(data)
                    y_data.append(next_line + make_split(next_line) +
                                  next_next_line)
                # with a line after next: the current line is the question, next + next-next the answer

    print(len(x_data), len(y_data))
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(ask)
        print(answer)
        print('*' * 20)

    print('fit data')
    data = list(zip(x_data, y_data))
    data = [(x, y) for x, y in data if len(x) < limit and len(y) < limit
            and len(x) >= x_limit and len(y) >= y_limit]

    # the filtered (question, answer) tuples are pickled here
    pickle.dump(data, open('chatbot.pkl', 'wb'))
    ws = WordSequence()
    ws.fit(x_data + y_data)  # build a vocabulary over the sentence pairs: keys are words, values are indices
    pickle.dump(ws, open('WordSequence.pkl', 'wb'))  # pickle the vocabulary to disk
    print('done')
Example #12
from word_sequence import WordSequence
from dataset import get_dataloader
import pickle
from tqdm import tqdm

if __name__ == '__main__':
    ws = WordSequence()
    train_data = get_dataloader(train=True)
    test_data = get_dataloader(train=False)
    for content, labels in tqdm(train_data):
        # content holds batch_size reviews
        for sentence in content:
            # accumulate word frequencies
            ws.fit(sentence)
    # build the vocabulary
    ws.build_vocab(min_count=5, max_count=10000)
    print(len(ws))
    # pickle.dump serializes the object to a Python-specific byte stream and writes it to the file
    pickle.dump(ws, open("./model/ws.pkl", "wb"))
Example #13
def main(limit=20, x_limit=1, y_limit=2):
    """Main routine for the text preprocessing.
    args: limit -- maximum length of questions and answers;
          x_limit -- minimum question length;
          y_limit -- minimum answer length
    """
    from word_sequence import WordSequence

    print('extracting lines')
    fp = open("xiaohuangji50w_fenciA.conv",
              'r',
              errors='ignore',
              encoding='utf-8')
    groups = []
    group = []

    # text normalization
    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')  # strip the newline
            # inspect the text file and adapt this to its actual format
            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]
            group.append(list(regular(''.join(line))))
        else:
            if group:
                groups.append(group)
                group = []
    # the final group: when the file ends on 'M ' lines with no trailing 'E ' line, group is still non-empty after the loop
    if group:
        groups.append(group)
        del group

    # build question-answer pairs (the file contains movie dialogue lines)
    x_data = []  # questions
    y_data = []  # answers
    for group in tqdm(groups):
        for i, line in enumerate(group):
            # pre_line: the line before line i (when the group has more than one line)
            pre_line = None
            if i > 0:
                pre_line = group[i - 1]
                if not good_line(pre_line):
                    pre_line = None  # bug fix: the original reset `last_line` here
            # next_line: the following line, when line i is not the last
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            # next_next_line: the line after next, when line i is at least two from the end
            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            # if the next line exists, line i becomes a question and line i+1 its answer
            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            # if both the previous and next lines exist, lines i-1 and i form the question and line i+1 the answer
            # make_split: the separator used when merging two sentences into one sequence
            if pre_line and next_line:
                x_data.append(pre_line + make_split(pre_line) + line)
                y_data.append(next_line)
            # if both the next line and the one after exist, line i is the question and lines i+1 and i+2 the answer
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) +
                              next_next_line)

    print(len(x_data), len(y_data))

    # print the first 20 question-answer pairs
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    data = list(zip(x_data, y_data))
    data = [(x, y) for x, y in data if len(x) < limit and len(y) < limit
            and len(y) >= y_limit and len(x) >= x_limit]

    # question-answer sentences after length filtering
    x_data, y_data = zip(*data)

    # count words over the input corpus and build the vocabulary
    print('fit word_sequence')
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)

    # save the question-answer sentences and the vocabulary
    print('dump')
    # pickle the question-answer sentences
    pickle.dump((x_data, y_data), open('chatbot.pkl', 'wb'))
    # pickle the vocabulary
    pickle.dump(ws_input, open('ws.pkl', 'wb'))

    print('done')
Example #14
def main(limit=20, x_limit=3, y_limit=6):
    # import our own class
    from word_sequence import WordSequence

    print('extract lines')
    fp = open('dgk_shooter_min.conv', 'r', errors='ignore',
              encoding='utf-8')  # read-only, ignore errors, utf-8 decoding
    groups = []  # ends up as [group, group, ...]; groups is three levels deep
    group = []  # group stores the lines of one dialogue; two levels deep

    for line in tqdm(fp):  # wrap the iterator in tqdm to show a progress bar
        if line.startswith('M '):
            line = line.replace('\n', '')  # strip the newline
            if '/' in line:
                line = line[2:].split('/')  # e.g. 'M 什/么/事/儿/这/么/急/啊/' is split on the slashes
            else:
                line = list(line[2:])  # start at index 2 to drop the 'M ' prefix
            line = line[:-1]  # drop the trailing empty element, e.g. ['邱', '先', '生', '戏', '刚', '一', '看', '完', '信', '就', '来', '啦']

            group.append(
                list(regular(''.join(line)))
            )  # '邱先生戏刚一看完信就来啦' -> [['邱', '先', '生', '戏', '刚', '一', '看', '完', '信', '就', '来', '啦']]
        else:
            if group:  # if the group has content
                groups.append(group)  # three levels deep
                group = []  # reset after each append
    if group:  # only append when non-empty
        groups.append(group)
        group = []

    # Build the Q/A pairs from the training dialogues: given three sentences
    # a1, a2, a3, the pairs are (a1, a2), (a1+a2, a3) and (a1, a2+a3).
    x_data = []  # questions
    y_data = []  # answers

    for group in tqdm(groups):
        for i, line in enumerate(group):  # enumerate yields (index, sentence) pairs

            # combine sentences as (a1, a2), (a1+a2, a3), (a1, a2+a3)

            last_line = None  # the previous line
            if i > 0:  # there are at least two lines
                last_line = group[i - 1]
                if not good_line(last_line):  # discard if not a clean sentence
                    last_line = None

            next_line = None  # the following line
            if i < len(group) - 1:  # a next line exists only below index len(group) - 1
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None

            next_next_line = None
            if i < len(group) - 2:  # a next-next line exists only below index len(group) - 2
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:  # the following line exists
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:  # both the previous and the following line exist
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) +
                              next_next_line)

    print(len(x_data), len(y_data))

    # preview the question-answer pairs
    for ask, answer in zip(x_data[:20], y_data[:20]):  # just the first 20 pairs
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    # generate the pkl files for later use
    data = list(zip(x_data, y_data))
    data = [(x, y) for x, y in data
            if limit > len(x) >= x_limit and limit > len(y) >= y_limit
            ]  # one more round of length filtering

    x_data, y_data = zip(*data)
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)  # fit on all the sentences
    print('dump')
    pickle.dump((x_data, y_data), open('chatbot.pkl', 'wb'))

    pickle.dump(ws_input, open('ws.pkl', 'wb'))
    print('done')
Example #15
def main(limit=20, x_limit=3, y_limit=6):
    # only keep sentences shorter than limit
    from word_sequence import WordSequence
    print('extract lines')
    fp = open("xiaohuangji50w_fenciA.conv", 'r', errors='ignore', encoding='utf-8')
    groups = []
    group = []
    for line in tqdm(fp):
        # sentence lines start with 'M '
        if line.startswith('M '):
            line = line.replace('\n', '/')  # turn the newline into '/' so the trailing element can be dropped
            if '/' in line:
                line = line[2:].split('/')  # split on '/'
            else:
                line = list(line[2:])  # start after the 'M ' prefix
            line = line[:-1]
            group.append(list(regular(''.join(line))))
        else:
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract group')
    x_data = []  # questions
    y_data = []  # answers
    for group in tqdm(groups):
        for i, line in enumerate(group):  # enumerate yields [(0, sentence1), (1, sentence2), ...]
            last_line = None
            if i > 0:
                last_line = group[i - 1]
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None
            if i < len(group) -2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) + next_next_line)

    print(len(x_data), len(y_data))
    # zip pairs each question with its answer for preview
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-'*20)

    data = list(zip(x_data, y_data))
    data = [
        (x, y)
        for x, y in data
        if len(x) < limit \
        and len(y) < limit \
        and len(y) >= y_limit \
        and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)
    print('fit word_sequence')
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)
    print('dump')
    pickle.dump(
        (x_data, y_data),
        open('chatbot.pkl', 'wb')
    )
    pickle.dump(ws_input, open('ws.pkl', 'wb'))

    print('done')
Example #16
# write the regularized text into .txt file
# with open('output_regulared.txt','w',encoding='utf-8') as f:
#     for group in groups:
#         for line in group:
#             print(line)
#             # print(type(line))
#             f.write("/".join(line))
#             # f.write('\n')
#             f.write("\n")
#         f.write("\n")


# even if the trailing E line is missing, the last dialogue group is still added to groups
# (this fragment assumes `groups`, numpy as np, pickle and WordSequence were set up by the extraction code above)
data_x = []
data_y = []
ws_input = WordSequence()
ws_input.fit(groups)
for group in range(len(groups)):
    for line in range(len(groups[group])):
        groups[group][line] = ws_input.transform(groups[group][line])

for group in groups:
    data_x.append(np.append(group[0], WordSequence.END))
    data_y.append(np.append(group[1], WordSequence.END))
# print(ws_input.dict)
print(data_x[0:5])
print(data_y[0:5])

pickle.dump(
    (data_x, data_y, ws_input),
    open('xiaohuangji_new.pkl', 'wb')
)
Example #17
def main(limit=20, x_limit=3, y_limit=6):
    """执行程序
    Args:
        limit: 只输出句子长度小于limit的句子
    """
    from word_sequence import WordSequence

    print('load pretrained vec')
    word_vec = pickle.load(open('word_vec.pkl', 'rb'))

    print('extract lines')
    fp = open('dgk_shooter_min.conv', 'r', errors='ignore')
    last_line = None
    groups = []
    group = []
    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')
            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]
            group.append(jieba.lcut(regular(''.join(line))))
        else:  # if line.startswith('E'):
            last_line = None
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract groups')
    x_data = []
    y_data = []
    for group in tqdm(groups):
        for i, line in enumerate(group):
            last_line = None
            if i > 0:
                last_line = group[i - 1]
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            # if last_line and next_line:
            #     x_data.append(last_line + make_split(last_line) + line)
            #     y_data.append(next_line)
            # if next_line and next_next_line:
            #     x_data.append(line)
            #     y_data.append(next_line + make_split(next_line) \
            #         + next_next_line)

    print(len(x_data), len(y_data))
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    data = list(zip(x_data, y_data))
    data = [
        (x, y)
        for x, y in data
        if len(x) < limit \
        and len(y) < limit \
        and len(y) >= y_limit \
        and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)

    print('refine train data')

    train_data = x_data + y_data

    # good_train_data = []
    # for line in tqdm(train_data):
    #     good_train_data.append([
    #         x for x in line
    #         if x in word_vec
    #     ])
    # train_data = good_train_data

    print('fit word_sequence')

    ws_input = WordSequence()

    ws_input.fit(train_data, max_features=100000)

    print('dump word_sequence')

    pickle.dump((x_data, y_data, ws_input), open('chatbot.pkl', 'wb'))

    print('make embedding vecs')

    emb = np.zeros((len(ws_input), len(word_vec['</s>'])))

    np.random.seed(1)
    for word, ind in ws_input.dict.items():
        if word in word_vec:
            emb[ind] = word_vec[word]
        else:
            emb[ind] = np.random.random(size=(300, )) - 0.5

    print('dump emb')

    pickle.dump(emb, open('emb.pkl', 'wb'))

    print('done')
Example #18
def main(limit=20, x_limit=3, y_limit=6):
    """
    Args:
        limit: 只輸出長度小於limit的句子
    """
    from word_sequence import WordSequence
    print('load pretrained vec')
    word_vec = pickle.load(open('./pickle/word_vec.pkl', 'rb'))

    print('extract lines')
    fp = open('./data/replaced_data.txt', 'r', errors='ignore')
    # last_line = None
    groups = []
    group = []
    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')
            if '/' in line:
                line = line[2:].split('/')
            else:
                line = line[2:]

            outline = jieba.lcut(regular(''.join(line)))

            group.append(outline)
        else:  # if line.startswith('E'):
            last_line = None
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract groups')
    x_data = []
    y_data = []
    for group in tqdm(groups):
        for i, line in enumerate(group):
            next_line = None
            if i + 1 >= len(group):
                continue
            if i % 2 == 0:
                next_line = group[i + 1]

            if next_line:
                x_data.append(line)
                y_data.append(next_line)

    x_f = open('./data/x_data.txt', 'w')
    y_f = open('./data/y_data.txt', 'w')
    for i in range(len(x_data) - 1):
        # x_line = x_data[i]
        # x_line = x_line[:-2]
        x_out = ''.join(list(x_data[i]))
        y_out = ''.join(list(y_data[i]))
        x_f.write(x_out + '\n')
        y_f.write(y_out + '\n')
    print(len(x_data), len(y_data))
    # exit()
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    data = list(zip(x_data, y_data))
    data = [(x, y) for x, y in data if len(x) < limit and len(y) < limit
            and len(y) >= y_limit and len(x) >= x_limit]
    x_data, y_data = zip(*data)

    print('refine train data')

    train_data = x_data + y_data

    print('fit word_sequence')

    ws_input = WordSequence()

    ws_input.fit(train_data, max_features=100000)

    print('dump word_sequence')

    pickle.dump((x_data, y_data, ws_input), open('./pickle/chatbot.pkl', 'wb'))

    print('make embedding vecs')

    emb = np.zeros((len(ws_input), len(word_vec['</s>'])))

    np.random.seed(1)
    for word, ind in ws_input.dict.items():
        if word in word_vec:
            emb[ind] = word_vec[word]
        else:
            emb[ind] = np.random.random_sample(size=(300, )) - 0.5

    print('dump emb')

    pickle.dump(emb, open('./pickle/emb.pkl', 'wb'))

    print('done')
Example #19
def main(limit=20, x_limit=3, y_limit=6):
    from word_sequence import WordSequence
    # read the input file
    print('extract lines')
    fp = open("xiaohuangji50w_fenciA.conv",
              'r',
              errors='ignore',
              encoding='utf-8')
    groups = []
    group = []
    # iterate the file with a tqdm progress bar
    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')

            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]

            group.append(list(regular(''.join(line))))
        else:
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract group')
    # extraction finished

    # split into x (questions) and y (answers)
    x_data = []
    y_data = []
    for group in tqdm(groups):
        # enumerate the lines
        for i, line in enumerate(group):
            last_line = None
            if i > 0:
                last_line = group[i - 1]
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) +
                              next_next_line)

    print(len(x_data), len(y_data))
    # preview x and y as question-answer pairs
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)


    # assemble x and y
    data = list(zip(x_data, y_data))
    data = [
        (x, y)
        for x, y in data
        if len(x) < limit \
        and len(y) < limit \
        and len(y) >= y_limit \
        and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)

    print('fit word_sequence')
    # fit the vocabulary (saved with pickle below)
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)

    print('dump')

    pickle.dump((x_data, y_data), open('chatbot_xhj.pkl', 'wb'))
    pickle.dump(ws_input, open('ws_xhj.pkl', 'wb'))

    print('done')
Example #20
def main(limit=20, x_limit=3, y_limit=6):
    """执行程序
    Args:
        limit: 只输出句子长度小于limit的句子
    """
    from word_sequence import WordSequence

    print('load pretrained vec')
    word_vec = pickle.load(open('word_vec.pkl', 'rb'))

    print('extract lines')
    fp = open('zhihu.csv', 'r', errors='ignore', encoding='utf-8')
    x_data = []
    y_data = []
    i = 0
    for line in tqdm(fp):
        #i+=1
        #if(i>10000):
        #    break
        line = line.replace('\n', '')
        x, y = line.split(',')
        x = x.split(' ')
        y = y.split(' ')
        x_data.append(x)
        y_data.append(y)

    print(len(x_data), len(y_data))
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    data = list(zip(x_data, y_data))
    data = [
        (x, y)
        for x, y in data
        if len(x) < limit \
        and len(y) < limit \
        and len(y) >= y_limit \
        and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)

    print('refine train data')

    train_data = x_data + y_data

    print('fit word_sequence')

    ws_input = WordSequence()

    ws_input.fit(train_data, max_features=100000)

    print('dump word_sequence')

    pickle.dump((x_data, y_data, ws_input), open('chatbot.pkl', 'wb'))

    print('make embedding vecs')

    emb = np.zeros((len(ws_input), len(word_vec['</s>'])))

    np.random.seed(1)
    for word, ind in ws_input.dict.items():
        if word in word_vec:
            emb[ind] = word_vec[word]
        else:
            emb[ind] = np.random.random(size=(300, )) - 0.5

    print('dump emb')

    pickle.dump(emb, open('emb.pkl', 'wb'))

    print('done')
Example #21
def main(limit=15):
    """执行程序
    Args:
        limit: 只输出句子长度小于limit的句子
    """
    from word_sequence import WordSequence

    x_data, y_data = [], []
    tree = ET.parse('en-zh_cn.tmx')
    root = tree.getroot()
    body = root.find('body')
    for tu in tqdm(body.findall('tu')):
        en = ''
        zh = ''
        for tuv in tu.findall('tuv'):
            if list(tuv.attrib.values())[0] == 'en':
                en += tuv.find('seg').text
            elif list(tuv.attrib.values())[0] == 'zh_cn':
                zh += tuv.find('seg').text

        if en and zh:
            x_data.append(en)
            y_data.append(zh)

    print(len(x_data))

    print(x_data[:10])
    print(y_data[:10])

    print('tokenize')

    def en_tokenize(text):
        # text = re.sub('[\((][^\))]+[\))]', '', text)
        return nltk.word_tokenize(text.lower())

    x_data = [en_tokenize(x) for x in tqdm(x_data)]

    def zh_tokenize(text):
        # text = text.replace(',', ',')
        # text = text.replace('。', '.')
        # text = text.replace('?', '?')
        # text = text.replace('!', '!')
        # text = text.replace(':', ':')
        # text = re.sub(r'[^\u4e00-\u9fff,\.\?\!…《》]:', '', text)
        # text = text.strip()
        text = jieba.lcut(text.lower())
        return text

    y_data = [zh_tokenize(y) for y in tqdm(y_data)]

    data = list(zip(x_data, y_data))
    data = [(x, y) for x, y in data if len(x) < limit and len(y) < limit]
    x_data, y_data = zip(*data)

    print(x_data[:10])
    print(y_data[:10])

    print(len(x_data), len(y_data))

    print('fit word_sequence')

    ws_input = WordSequence()
    ws_target = WordSequence()
    ws_input.fit(x_data)
    ws_target.fit(y_data)

    print('dump')

    pickle.dump((x_data, y_data, ws_input, ws_target),
                open('en-zh_cn.pkl', 'wb'))

    print('done')
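The TMX layout that this example walks (body > tu > tuv > seg, with a language attribute on each tuv) can be seen on a tiny inline document; the element names match the parsing code above, while the snippet itself is only illustrative:

import xml.etree.ElementTree as ET

# a tiny inline TMX document with the same body/tu/tuv/seg structure
sample = """
<tmx><body>
  <tu>
    <tuv xml:lang="en"><seg>hello world</seg></tuv>
    <tuv xml:lang="zh_cn"><seg>你好,世界</seg></tuv>
  </tu>
</body></tmx>
"""
root = ET.fromstring(sample)
for tu in root.find('body').findall('tu'):
    for tuv in tu.findall('tuv'):
        lang = list(tuv.attrib.values())[0]  # 'en' or 'zh_cn'
        print(lang, tuv.find('seg').text)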
Example #22
def main(limit=20, x_limit=3, y_limit=6):
    print("extract lines")

    fp = open("./data/dgk_shooter_min.conv", 'r', encoding='utf-8')
    groups = []
    group = []

    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')
            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]

            group.append(list(regular(''.join(line))))
        else:
            if group:
                groups.append(group)
                group = []
    print('extract group')

    x_data = []
    y_data = []

    for group in tqdm(groups):
        for i, line in enumerate(group):
            last_line = None
            if i > 0:
                last_line = group[i - 1]
                if not good_line(last_line):
                    last_line = None  # bug fix: the original set a misspelled `lsat_line`

            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                if checkLimit(line, next_line, limit, x_limit, y_limit):
                    x_data.append(line)
                    y_data.append(next_line)

            if last_line and next_line:
                a = last_line + make_split(last_line) + line
                b = next_line
                if checkLimit(a, b, limit, x_limit, y_limit):
                    x_data.append(a)
                    y_data.append(b)

            if next_line and next_next_line:
                a = line
                b = next_line + make_split(next_line) + next_next_line
                if checkLimit(a, b, limit, x_limit, y_limit):
                    x_data.append(a)
                    y_data.append(b)

    x_len = len(x_data)
    y_len = len(y_data)
    print(x_len, y_len)

    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)
    """
    print("listing zip...")
    data = list(zip(x_data,y_data))

    print("fixing data...")
    data = [
        (x,y)
        for x,y in data
        if len(x) <limit \
        and len(y) < limit \
        and len(y)>=y_limit \
        and len(x)>=x_limit

    ]

    print("rezipping data...")
    x_data,y_data=zip(*data)
    """
    """
    print('fit word_sequence...')
    for k in tqdm(range(x_len-1, -1, -1)):
        x = x_data[k]
        y = y_data[k]
        xLen = len(x)
        yLen = len(y)
        if not (xLen < limit and yLen < limit and yLen >= y_limit and xLen >= x_limit):
            old = id(x_data)
            x_data.remove(x)
            y_data.remove(y)
            print("elem removed, x_data address delta:", old, id(x_data))
    """

    print('fit word_sequence..done')

    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)

    print('dump')

    pickle.dump((x_data, y_data), open('./data/chatbot.pkl', 'wb'))
    pickle.dump(ws_input, open('./data/ws.pkl', 'wb'))
    print('done')
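checkLimit is called above but never defined. Judging from the commented-out filter inside the same function, it presumably applies the same length bounds; a sketch under that assumption:

def checkLimit(x, y, limit, x_limit, y_limit):
    # same condition as the commented-out list-comprehension filter above
    return limit > len(x) >= x_limit and limit > len(y) >= y_limit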
Example #23
def main(limit=20, x_limit=3, y_limit=6):
    from word_sequence import WordSequence

    print('extract lines')
    fp = open('dataset/dgk_shooter_min.conv',
              'r',
              errors='ignore',
              encoding='utf-8')
    groups = []
    group = []

    # tqdm: progress-bar library
    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')
            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]
            # append the cleaned line to the current group
            group.append(list(regular(''.join(line))))
        else:
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []

    print('extract group')

    x_data = []
    y_data = []

    for group in tqdm(groups):
        for i, line in enumerate(group):  # i is the index, line a sentence such as '畹/华/吾/侄/'

            last_line = None  # last_line / next_line / next_next_line drive the QA pairing
            if i > 0:  # the group has at least two lines
                last_line = group[i - 1]  # fetch the previous line
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None

            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:  # the current line has a following line
                x_data.append(line)  # x gets the current line
                y_data.append(next_line)  # y gets the following line
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) +
                              next_next_line)

    print(len(x_data), len(y_data))

    # preview the question-answer pairs
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    # generate the pkl files for later use
    data = list(zip(x_data, y_data))
    data = [(x, y) for x, y in data
            if limit > len(x) >= x_limit and limit > len(y) >= y_limit]

    x_data, y_data = zip(*data)
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)
    print('dump')
    pickle.dump((x_data, y_data), open('chatbot.pkl', 'wb'))
    pickle.dump(ws_input, open('ws.pkl', 'wb'))
    print('done')
Example #24
def main(limit=20, x_limit=3, y_limit=6):  # a sentence may hold at most 20 characters
    # extract the file
    print('extract lines')
    fp = open('dgk_shooter_min_test.conv', 'r', errors='ignore', encoding='utf-8')

    # strip the 'M ' prefix and the slashes, and drop invalid sentences
    groups = []
    group = []
    for line in tqdm(fp):
        if line.startswith('M '):  # this line is a sentence and needs processing
            line = line.replace('\n', '')
            if '/' in line:  # if the line contains '/', split on '/' starting after the 'M ' prefix
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]

            group.append(list(regular(''.join(line))))
        else:
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract group')


    # build the question-answer pairs
    x_data = []
    y_data = []
    for group in tqdm(groups):  # tqdm shows the progress
        for i, line in enumerate(group):  # enumerate yields (index, sentence)
            last_line = None
            if i > 0:  # the group has at least two lines
                last_line = group[i-1]
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i+1]
                if not good_line(next_line):
                    next_line = None

            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None
            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) + next_next_line)

    print(len(x_data), len(y_data))


    # # preview the question-answer pairs for testing
    # for ask, answer in zip(x_data[:-1], y_data[:-1]):
    #     print(''.join(ask))
    #     print(''.join(answer))
    #     print('-'*20)


    # generate the pkl files for later use
    data = list(zip(x_data, y_data))
    data = [
        (x, y) for x, y in data if limit > len(x) >= x_limit and limit > len(y) >= y_limit
    ]
    x_data, y_data = zip(*data)
    
    # fit and save the vocabulary model
    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)
    print('dump')
    pickle.dump((x_data, y_data), open('chatbot_test.pkl', 'wb'))
    pickle.dump(ws_input, open('ws_test.pkl', 'wb'))
    print('done')
Example #25
def main(limit=20, x_limit=3, y_limit=6):

    print('extract lines')
    fp = open("dgk_shooter_min.conv", 'r', errors='ignore', encoding='utf-8')
    groups = []
    group = []

    for line in tqdm(fp):
        # tqdm(iterator): progress-bar library
        if line.startswith('M '):
            line = line.replace('\n', '')

            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]  # token list for the dialogue line

            group.append(list(regular(''.join(line))))  # the current dialogue group
        else:  # a non-'M' line ends the current dialogue group
            if group:
                groups.append(group)  # the list of dialogue groups
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract group')

    x_data = []
    y_data = []
    for group in tqdm(groups):
        for i, line in enumerate(group):
            last_line = None
            if i > 0:
                last_line = group[i - 1]
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) +
                              next_next_line)

    print(len(x_data), len(y_data))

    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    data = list(zip(x_data, y_data))
    data = [
        (x, y)
        for x, y in data
        if len(x) < limit \
        and len(y) < limit \
        and len(y) >= y_limit \
        and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)

    print('fit word_sequence')

    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)

    print('dump')

    pickle.dump((x_data, y_data), open('chatbot.pkl', 'wb'))
    pickle.dump(ws_input, open('ws.pkl', 'wb'))

    print('done')
Example #26
def main(limit=20, x_limit=3, y_limit=6):
    """执行程序
    Args:
        limit: 只输出句子长度小于limit的句子
    """
    from word_sequence import WordSequence

    print('extract lines')
    fp = open('dgk_shooter_min.conv', 'r', errors='ignore')
    last_line = None
    groups = []
    group = []
    for line in tqdm(fp):
        if line.startswith('M '):
            line = line.replace('\n', '')
            if '/' in line:
                line = line[2:].split('/')
            else:
                line = list(line[2:])
            line = line[:-1]
            group.append(list(regular(''.join(line))))
        else:  # if line.startswith('E'):
            last_line = None
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract groups')
    x_data = []
    y_data = []
    for group in tqdm(groups):
        for i, line in enumerate(group):
            last_line = None
            if i > 0:
                last_line = group[i - 1]
                if not good_line(last_line):
                    last_line = None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) \
                    + next_next_line)

    print(len(x_data), len(y_data))
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    data = list(zip(x_data, y_data))
    data = [
        (x, y)
        for x, y in data
        if len(x) < limit \
        and len(y) < limit \
        and len(y) >= y_limit \
        and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)

    print('fit word_sequence')

    ws_input = WordSequence()
    ws_input.fit(x_data + y_data)

    print('dump')

    pickle.dump((x_data, y_data), open('chatbot.pkl', 'wb'))
    pickle.dump(ws_input, open('ws.pkl', 'wb'))

    print('done')
Example #27
def main(limit=20, x_limit=3, y_limit=6):  # limit (20) is the maximum sentence length
    # WordSequence is our own class for encoding sentences; a local class is
    # imported with `from <module> import <name>`
    from word_sequence import WordSequence

    print('extract lines')  # extract the file
    # open the file: name, read mode, ignore errors, utf-8 decoding
    fp = open("chat_data1.conv", 'r', errors='ignore', encoding='utf-8')
    groups = []
    group = []
    # tqdm wraps the iterator to display a progress bar
    for line in tqdm(fp):
        # sentence lines start with 'M '
        if line.startswith('M '):
            # strip the newline
            line = line.replace('\n', '')
            # drop the 'M ' prefix and split the rest on the slashes
            if '/' in line:
                line = line[2:].split('/')
            # no slash: read the line directly
            else:
                line = list(line[2:])
            # drop the last element of the line
            line = line[:-1]
            # one cleaned line for the current group
            group.append(list(regular(''.join(line))))
        # a line starting with 'E' ends the group
        else:
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('extract group')
    # extraction finished; everything is stored in groups
    x_data = []
    y_data = []
    # process the dialogue pairs
    for group in tqdm(groups):
        for i, line in enumerate(group):  # enumerate yields two values: the index i and the sentence line
            # the previous line defaults to None
            last_line = None
            if i > 0:  # the group has at least two lines
                last_line = group[i - 1]  # fetch the previous line
                if not good_line(last_line):  # keep it only if it is a clean sentence
                    last_line = None
            # the next line also defaults to None
            next_line = None
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            # the line after next also defaults to None
            next_next_line = None
            if i < len(group) -2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None
            # build the question-answer pairs: x holds questions, y answers
            if next_line:
                x_data.append(line)  # the current line becomes the question
                y_data.append(next_line)  # the following line becomes the answer
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) + next_next_line)

    print(len(x_data), len(y_data))
    # print a sample of the question-answer pairs
    for ask, answer in zip(x_data[:20], y_data[:20]):  # zip is a Python builtin that pairs the two lists
        print(''.join(ask))
        print(''.join(answer))
        print('-'*20)


    data = list(zip(x_data, y_data))
    # filter the pairs by length
    data = [
        (x, y)
        for x, y in data
        # limit (20) is the maximum sentence length
        if len(x) < limit
        and len(y) < limit
        # x_limit / y_limit are the minimum lengths
        and len(y) >= y_limit
        and len(x) >= x_limit
    ]
    x_data, y_data = zip(*data)

    print('fit word_sequence')

    ws_input = WordSequence()
    # fit WordSequence so words can be mapped to indices
    ws_input.fit(x_data + y_data)

    print('dump')
    # pickle.dump(obj, file[, protocol]) serializes the object and writes the
    # byte stream to the file object; in Python 3 the default protocol is
    # already binary (the Python 2 default, protocol 0, was text-based)

    # save the chatbot.pkl and ws.pkl files
    pickle.dump(
        (x_data, y_data),
        open('chatbot.pkl', 'wb')
    )
    pickle.dump(ws_input, open('ws.pkl', 'wb'))

    print('done')
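The pickle comment in Example #27 mentions the optional protocol argument; a quick demonstration of choosing an explicit binary protocol and reading the data back (the file name is only illustrative):

import pickle

pairs = ([['你', '好']], [['好']])
with open('demo.pkl', 'wb') as f:
    pickle.dump(pairs, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('demo.pkl', 'rb') as f:
    x_data, y_data = pickle.load(f)
print(x_data, y_data)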