Example #1
def test_bleu(count):
    """测试bleu, 这个方法我们不看"""
    print("bleu test mode")
    from nltk.translate.bleu_score import sentence_bleu
    from tqdm import tqdm
    # Prepare data
    print('Preparing data')
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    # If count is 0, sample over all examples by default
    if count <= 0:
        count = total_size
    buckets_scale = [
        sum(bucket_sizes[:i + 1]) / total_size
        for i in range(len(bucket_sizes))
    ]
    with tf.Session() as sess:
        # Build the model
        model = create_model(sess, True)
        model.batch_size = 1
        # Initialize variables
        sess.run(tf.initialize_all_variables())
        model.saver.restore(sess,
                            os.path.join(FLAGS.model_dir, FLAGS.model_name))

        total_score = 0.0
        for i in tqdm(range(count)):
            # Pick a bucket to sample from
            random_number = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(buckets_scale))
                if buckets_scale[i] > random_number
            ])
            data, _ = model.get_batch_data(bucket_dbs, bucket_id)
            encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                bucket_id, data)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, decoder_weights,
                                             bucket_id, True)
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            ask, _ = data[0]
            all_answers = bucket_dbs[bucket_id].all_answers(ask)
            ret = data_utils.indice_sentence(outputs)
            if not ret:
                continue
            references = [list(x) for x in all_answers]
            score = sentence_bleu(references, list(ret), weights=(1.0, ))
            total_score += score
        print('BLEU: {:.2f} in {} samples'.format(total_score / count * 10,
                                                  count))
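
Example #1 calls sentence_bleu with weights=(1.0,), which restricts scoring to unigram precision, and wraps both references and the hypothesis in list(...), so matching happens character by character. A minimal, self-contained sketch of that call with made-up strings rather than data from the chatbot corpus:

from nltk.translate.bleu_score import sentence_bleu

# Character-level references and hypothesis, mirroring list(x) / list(ret) above.
references = [list("你好吗"), list("你好呀")]
hypothesis = list("你好")

# weights=(1.0,) -> unigram-only BLEU: clipped character precision times the
# brevity penalty applied to short hypotheses.
score = sentence_bleu(references, hypothesis, weights=(1.0,))
print(score)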
Example #2
def test_bleu(count):
    u'Test BLEU'
    from nltk.translate.bleu_score import sentence_bleu
    from tqdm import tqdm
    print(u'Preparing data')
    bucket_dbs = data_utils.read_bucket_dbs(
        FLAGS.buckets_dir)  #FLAGS.buckets_dir
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print(u'bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print(u'{} records in total'.format(total_size))
    if (count <= 0):
        count = total_size
    buckets_scale = [(sum(bucket_sizes[:(i + 1)]) / total_size)
                     for i in range(len(bucket_sizes))]
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        sess.run(tf.initialize_all_variables())
        model.saver.restore(sess,
                            os.path.join(FLAGS.model_dir, FLAGS.model_name))
        total_score = 0.0
        for i in tqdm(range(count)):
            random_number = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(buckets_scale))
                if (buckets_scale[i] > random_number)
            ])
            (data, _) = model.get_batch_data(bucket_dbs, bucket_id)
            (encoder_inputs, decoder_inputs,
             decoder_weights) = model.get_batch(bucket_dbs, bucket_id, data)
            (_, _, output_logits) = model.step(sess, encoder_inputs,
                                               decoder_inputs, decoder_weights,
                                               bucket_id, True)
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            (ask, _) = data[0]
            all_answers = bucket_dbs[bucket_id].all_answers(ask)
            ret = data_utils.indice_sentence(outputs)
            if (not ret):
                continue
            references = [list(x) for x in all_answers]
            score = sentence_bleu(references, list(ret), weights=(1.0, ))
            total_score += score
        print(u'BLEU: {:.2f} in {} samples'.format(
            ((total_score / count) * 10), count))
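
Both BLEU examples (and all the train() variants below) pick a bucket by comparing one uniform random number against buckets_scale, the cumulative share of each bucket in the corpus, so larger buckets are sampled more often. A standalone sketch of that step with hypothetical bucket sizes:

import numpy as np

# Hypothetical bucket sizes; in the examples these come from bucket_dbs[i].size.
bucket_sizes = [500, 1500, 2500, 5500]
total_size = sum(bucket_sizes)

# Cumulative proportions, here [0.05, 0.2, 0.45, 1.0]: bucket i covers the
# interval (scale[i-1], scale[i]] of the unit line.
buckets_scale = [sum(bucket_sizes[:i + 1]) / total_size
                 for i in range(len(bucket_sizes))]

# The first bucket whose cumulative share exceeds the random draw is chosen,
# which samples buckets in proportion to their size.
random_number = np.random.random_sample()
bucket_id = min(i for i in range(len(buckets_scale))
                if buckets_scale[i] > random_number)
print(bucket_id)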
Example #3
def train():
    """训练模型"""
    # 准备数据
    print("train mode")
    print('准备数据')
    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    # Build the model and start training
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=False,
                                          gpu_options=gpu_options)) as sess:
        # Build the model
        model = create_model(sess, False)
        # Initialize variables
        sess.run(tf.initialize_all_variables())
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        #print("ckpt path : ", ckpt.model_checkpoint_path)
        if ckpt is not None:
            print("Loading model from:", ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("not exist old model")
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]
        # Start training
        metrics = '  '.join(
            ['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}'])
        bars_max = 20
        with tf.device('/gpu:0'):
            for epoch_index in range(1, FLAGS.num_epoch + 1):
                print('Epoch {}:'.format(epoch_index))
                time_start = time.time()
                epoch_trained = 0
                batch_loss = []
                while True:
                    # Pick a bucket to train on
                    random_number = np.random.random_sample()
                    bucket_id = min([
                        i for i in range(len(buckets_scale))
                        if buckets_scale[i] > random_number
                    ])
                    data, data_in = model.get_batch_data(bucket_dbs, bucket_id)
                    encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                        bucket_dbs, bucket_id, data)
                    _, step_loss, output = model.step(sess, encoder_inputs,
                                                      decoder_inputs,
                                                      decoder_weights,
                                                      bucket_id, False)
                    epoch_trained += FLAGS.batch_size
                    batch_loss.append(step_loss)
                    time_now = time.time()
                    time_spend = time_now - time_start
                    time_estimate = time_spend / (epoch_trained /
                                                  FLAGS.num_per_epoch)
                    percent = min(100,
                                  epoch_trained / FLAGS.num_per_epoch) * 100
                    bars = math.floor(percent / 100 * bars_max)
                    sys.stdout.write(
                        metrics.format('=' * bars + '-' * (bars_max - bars),
                                       percent,
                                       epoch_trained, FLAGS.num_per_epoch,
                                       np.mean(batch_loss),
                                       data_utils.time(time_spend),
                                       data_utils.time(time_estimate)))
                    sys.stdout.flush()
                    if epoch_trained >= FLAGS.num_per_epoch:
                        model.saver.save(sess,
                                         os.path.join(FLAGS.model_dir,
                                                      FLAGS.model_name),
                                         global_step=epoch_index)
                        break
                print('\n')
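
The metrics string used throughout these train() examples builds an in-place progress bar: '\r' returns the cursor to the start of the line, '=' marks completed progress and '-' the remainder. A stripped-down sketch with hypothetical numbers standing in for the FLAGS values and for data_utils.time formatting:

import math
import sys

# Same format string as the training examples; '\r' rewinds the cursor so each
# write overdraws the previous one.
metrics = '  '.join(['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}'])
bars_max = 20

num_per_epoch = 10000   # hypothetical stand-in for FLAGS.num_per_epoch
epoch_trained = 6400
time_spend = 12.8       # raw seconds instead of data_utils.time(...)
mean_loss = 3.217

percent = min(100, epoch_trained / num_per_epoch) * 100
bars = math.floor(percent / 100 * bars_max)
time_estimate = time_spend / (epoch_trained / num_per_epoch)

sys.stdout.write(metrics.format('=' * bars + '-' * (bars_max - bars),
                                percent, epoch_trained, num_per_epoch,
                                mean_loss, time_spend, time_estimate))
sys.stdout.flush()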
Example #4
def train():
    u'Train the model'
    print(u'Preparing data')

    bucket_dbs = data_utils.read_bucket_dbs(
        FLAGS.buckets_dir)  #FLAGS.buckets_dir
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print(u'bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print(u'{} records in total'.format(total_size))
    with tf.Session() as sess:
        model = create_model(sess, False)
        sess.run(tf.initialize_all_variables())
        buckets_scale = [(sum(bucket_sizes[:(i + 1)]) / total_size)
                         for i in range(len(bucket_sizes))]
        metrics = u'  '.join(
            [u'\r[{}]', u'{:.1f}%', u'{}/{}', u'loss={:.3f}', u'{}/{}'])
        bars_max = 20
        for epoch_index in range(1, (FLAGS.num_epoch + 1)):
            print(u'Epoch {}:'.format(epoch_index))
            time_start = time.time()
            epoch_trained = 0
            batch_loss = []
            while True:
                random_number = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(buckets_scale))
                    if (buckets_scale[i] > random_number)
                ])
                (data, data_in) = model.get_batch_data(bucket_dbs, bucket_id)
                (encoder_inputs, decoder_inputs,
                 decoder_weights) = model.get_batch(bucket_dbs, bucket_id,
                                                    data)
                (_, step_loss,
                 output) = model.step(sess, encoder_inputs, decoder_inputs,
                                      decoder_weights, bucket_id, False)
                epoch_trained += FLAGS.batch_size
                batch_loss.append(step_loss)
                time_now = time.time()
                time_spend = (time_now - time_start)
                time_estimate = (time_spend /
                                 (epoch_trained / FLAGS.num_per_epoch))
                percent = (min(100,
                               (epoch_trained / FLAGS.num_per_epoch)) * 100)
                bars = math.floor(((percent / 100) * bars_max))
                #sys.stdout.write(metrics.format(((u'=' * bars) + (u'-' * (bars_max - bars))), float(percent), epoch_trained,
                #                                FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate)))
                sys.stdout.flush()
                if (epoch_trained >= FLAGS.num_per_epoch):
                    break
            print(u'\n')
        if (not os.path.exists(FLAGS.model_dir)):
            os.makedirs(FLAGS.model_dir)
        parser = argparse.ArgumentParser()
        parser.add_argument('--model_dir',
                            type=str,
                            default='',
                            help='output data path')
        ARGS = None
        ARGS, _ = parser.parse_known_args()
        model_path = ARGS.model_dir
        save_path = os.path.join(model_path, "model1")
        model.saver.save(sess, save_path)
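
Example #4 resolves the save directory with argparse inside train(), using parse_known_args so that flags it does not define (for example TensorFlow's own) are returned instead of raising an error. A standalone sketch of that pattern; the flag name comes from the example, the values are hypothetical:

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, default='', help='output data path')

# parse_known_args() splits the command line into recognised arguments and a
# list of leftovers rather than failing on unknown flags.
args, unknown = parser.parse_known_args(['--model_dir', './model', '--other', '1'])

save_path = os.path.join(args.model_dir, 'model1')
print(save_path, unknown)  # ./model/model1 ['--other', '1']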
Example #5
def train():
    # Workflow:
    # 1. Data preprocessing
    # 2. seq2seq

    # ========================================================
    # Prepare data
    print("train mode.......")
    print('Preparing data')
    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)

    # Data preprocessing
    # buckets_dir is the training-data directory
    # Returns a list of four buckets, each holding its own database contents
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    # Collect the record count of each bucket
    bucket_sizes = []
    for i in range(len(buckets)):
        # Size of this bucket
        bucket_size = bucket_dbs[i].size  # record count of each bucket
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    # Total number of samples
    total_size = sum(bucket_sizes)  # total sample count
    print('{} records in total'.format(total_size))

    # Build the model and start training
    gpu_options = tf.GPUOptions(
        allow_growth=True,  # allow GPU memory to be allocated incrementally
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=gpu_options)) as sess:
        # Build the model (each bucket has its own training op, loss and summary info; the four sets share parameters)
        model = create_model(sess, False)

        # Initialize variables & restore the model
        print("Initializing variables and restoring the model.....")
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Load old model from : ", ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
            model.saver.recover_last_checkpoints(
                ckpt.all_model_checkpoint_paths)
        else:
            print("Not exist old model")

        # Cumulative sample proportion per bucket (bucket 1; buckets 1+2; 1+2+3; 1+2+3+4)
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]

        # Start training
        metrics = '  '.join(
            ['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}'])

        # bars_max controls the width of the progress bar
        bars_max = 20
        writer = tf.summary.FileWriter('log', graph=sess.graph)
        merges = []
        # For each bucket (each training object), get its merged summary op for visualization
        for b_idx in model.bucket_to_summary_list:
            merges.append(tf.summary.merge(
                model.bucket_to_summary_list[b_idx]))
        print("开始模型训练.....")
        with tf.device('/gpu:0'):
            for epoch_index in range(1, FLAGS.num_epoch + 1):
                print('Epoch {}:'.format(epoch_index))
                # Record the start time
                time_start = time.time()
                # Reset the progress counter to 0
                epoch_trained = 0
                batch_loss = []
                while True:
                    # Randomly pick a bucket to train on
                    random_number = np.random.random_sample()
                    bucket_id = min([
                        i for i in range(len(buckets_scale))
                        if buckets_scale[i] > random_number
                    ])
                    # Fetch data (batch_size, e.g. 16, records from the randomly chosen bucket)
                    data, _ = model.get_batch_data(bucket_dbs, bucket_id)
                    encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                        bucket_id, data)

                    # Run one training iteration
                    _, step_loss, summary_merge, output = model.step(
                        sess, encoder_inputs, decoder_inputs, decoder_weights,
                        bucket_id, False, merges[bucket_id])

                    epoch_trained += FLAGS.batch_size
                    batch_loss.append(step_loss)
                    # Current time
                    time_now = time.time()
                    # Elapsed time
                    time_spend = time_now - time_start
                    # Estimated time for the full epoch
                    time_estimate = time_spend / (epoch_trained /
                                                  FLAGS.num_per_epoch)
                    # Current progress percentage
                    percent = min(100,
                                  epoch_trained / FLAGS.num_per_epoch) * 100
                    # Number of bars to draw, at most 20
                    bars = math.floor(percent / 100 * bars_max)
                    # Write the progress output ('=' done, '-' remaining)
                    sys.stdout.write(
                        metrics.format(
                            '=' * int(bars) + '-' * int(bars_max - bars),
                            percent, epoch_trained, FLAGS.num_per_epoch,
                            np.mean(batch_loss), data_utils.time(time_spend),
                            data_utils.time(time_estimate)))
                    # Flush the output
                    sys.stdout.flush()
                    if summary_merge is not None:
                        writer.add_summary(summary_merge,
                                           global_step=epoch_index)
                    if epoch_trained >= FLAGS.num_per_epoch:
                        model.saver.save(sess,
                                         os.path.join(FLAGS.model_dir,
                                                      FLAGS.model_name),
                                         global_step=epoch_index)
                        break
                print('\n')

        # Save the model one final time
        model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
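
Example #5 is the only variant that writes TensorBoard summaries: each bucket's summary ops are merged once up front, and the serialized summary returned by model.step is added to a FileWriter. A minimal TF1-style sketch of that plumbing; the scalar names and the bucket_to_summary_list stand-in are hypothetical:

import tensorflow as tf

# Hypothetical per-bucket loss placeholders and scalar summaries, standing in
# for model.bucket_to_summary_list in the example.
bucket_losses = {b: tf.placeholder(tf.float32, name='loss_b%d' % b) for b in range(4)}
bucket_to_summary_list = {
    b: [tf.summary.scalar('loss_bucket_%d' % b, bucket_losses[b])] for b in range(4)
}

# One merged op per bucket, so only the chosen bucket's summaries run each step.
merges = {b: tf.summary.merge(ops) for b, ops in bucket_to_summary_list.items()}

with tf.Session() as sess:
    writer = tf.summary.FileWriter('log', graph=sess.graph)
    bucket_id, step = 2, 1
    summary_proto = sess.run(merges[bucket_id],
                             feed_dict={bucket_losses[bucket_id]: 3.14})
    writer.add_summary(summary_proto, global_step=step)
    writer.close()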
Example #6
def train():
    """训练模型"""
    # 准备数据
    print('准备数据')
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    # Build the model and start training
    with tf.Session() as sess:
        # Build the model
        model = create_model(sess, False)
        # Initialize variables
        sess.run(tf.initialize_all_variables())
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]
        # Start training
        metrics = '  '.join([
            '\r[{}]',
            '{:.1f}%',
            '{}/{}',
            'loss={:.3f}',
            '{}/{}',
            'learning rate={:.5f}'
        ])
        bars_max = 20
        for epoch_index in range(1, FLAGS.num_epoch + 1):
            print('Epoch {}:'.format(epoch_index))
            time_start = time.time()
            epoch_trained = 0
            batch_loss = []

            # Track losses across checkpoints for learning-rate decay
            previous_losses = []
            current_step = 0
            loss = 0
            while True:
                # Pick a bucket to train on
                random_number = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(buckets_scale))
                    if buckets_scale[i] > random_number
                ])
                data, data_in = model.get_batch_data(
                    bucket_dbs,
                    bucket_id
                )
                encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                    bucket_dbs,
                    bucket_id,
                    data
                )
                _, step_loss, output = model.step(
                    sess,
                    encoder_inputs,
                    decoder_inputs,
                    decoder_weights,
                    bucket_id,
                    False
                )

                loss += step_loss / FLAGS.steps_per_checkpoint
                current_step += 1

                if current_step % FLAGS.steps_per_checkpoint == 0:
                    # Decay the learning rate when the loss stops improving
                    if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                        sess.run(model.learning_rate_decay_op)
                    previous_losses.append(loss)
                    loss = 0
                    if not os.path.exists(FLAGS.model_dir):
                        os.makedirs(FLAGS.model_dir)
                    model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))


                epoch_trained += FLAGS.batch_size
                batch_loss.append(step_loss)
                time_now = time.time()
                time_spend = time_now - time_start
                time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch)
                percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100
                bars = math.floor(percent / 100 * bars_max)
                sys.stdout.write(metrics.format(
                    '=' * bars + '-' * (bars_max - bars),
                    percent,
                    epoch_trained, FLAGS.num_per_epoch,
                    np.mean(batch_loss),
                    data_utils.time(time_spend), data_utils.time(time_estimate),model.learning_rate.eval()
                ))
                sys.stdout.flush()
                if epoch_trained >= FLAGS.num_per_epoch:
                    break
            print('\n')

        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
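
Example #6 is the only variant that decays the learning rate: every steps_per_checkpoint steps it compares the accumulated loss with the last three checkpoints and runs model.learning_rate_decay_op if there was no improvement. A plain-Python sketch of that plateau rule, with a float standing in for the model's learning-rate variable and made-up checkpoint losses:

learning_rate = 0.5          # stands in for model.learning_rate
decay_factor = 0.99          # stands in for model.learning_rate_decay_op
previous_losses = []

def maybe_decay(checkpoint_loss):
    """Decay the rate if the loss did not beat any of the last three checkpoints."""
    global learning_rate
    if len(previous_losses) > 2 and checkpoint_loss > max(previous_losses[-3:]):
        learning_rate *= decay_factor
    previous_losses.append(checkpoint_loss)

for checkpoint_loss in [4.0, 3.5, 3.2, 3.3, 3.6]:
    maybe_decay(checkpoint_loss)
print(learning_rate)  # decayed once, when 3.6 exceeded max(3.5, 3.2, 3.3)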
Example #7
def train():
    """训练模型"""
    # 准备数据
    print('准备数据')
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.66)
    config = tf.ConfigProto(gpu_options=gpu_options)
    # Guard against out-of-memory errors
    config.gpu_options.allocator_type = 'BFC'

    # Build the model and start training (pass the config built above)
    with tf.Session(config=config) as sess:
        # Build the model
        model = create_model(sess, False)
        # Initialize variables
        sess.run(tf.initialize_all_variables())
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]
        # Start training
        metrics = '  '.join([
            '\r[{}]',
            '{:.1f}%',
            '{}/{}',
            'loss={:.3f}',
            '{}/{}'
        ])
        bars_max = 20
        for epoch_index in range(1, FLAGS.num_epoch + 1):
            print('Epoch {}:'.format(epoch_index))
            time_start = time.time()
            epoch_trained = 0
            batch_loss = []
            while True:
                # Pick a bucket to train on
                random_number = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(buckets_scale))
                    if buckets_scale[i] > random_number
                ])
                data, data_in = model.get_batch_data(
                    bucket_dbs,
                    bucket_id
                )
                encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                    bucket_dbs,
                    bucket_id,
                    data
                )
                _, step_loss, output = model.step(
                    sess,
                    encoder_inputs,
                    decoder_inputs,
                    decoder_weights,
                    bucket_id,
                    False
                )
                epoch_trained += FLAGS.batch_size
                batch_loss.append(step_loss)
                time_now = time.time()
                time_spend = time_now - time_start
                time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch)
                percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100
                bars = math.floor(percent / 100 * bars_max)
                sys.stdout.write(metrics.format(
                    '=' * bars + '-' * (bars_max - bars),
                    percent,
                    epoch_trained, FLAGS.num_per_epoch,
                    np.mean(batch_loss),
                    data_utils.time(time_spend), data_utils.time(time_estimate)
                ))
                sys.stdout.flush()
                if epoch_trained >= FLAGS.num_per_epoch:
                    break
            print('\n')

        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
Example #8
def train():
    """训练模型"""
    # 准备数据
    print('准备数据')
    #数据预处理有两步:1、decode_conv 2、data_utils
    #原始数据集不是很好的问答式数据集。用decode_conv处理的数据,假定有ABC三个句子,则处理成两句问答:A:B,B:C,然后都插入到sqlite3里
    #生成一个conversion.db文件,然后使用data_utils来进行语句处理,即对这个db文件做进一步处理
    #对应四种格式,5_15,10_20,15_25,20_30,分别代表问句和回答句的字数上限。比如5_15即问句不超过5个字且答句不超过15个字。
    #这种方式也和命名实体识别的一个性质,是为了能最小padding,进行局部padding,如果有句子太长的,但是不太多,那么可以滤掉。
    #因为一般的对话都不会太长
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    # Up to this point we have only loaded the data in the four buckets and counted the total records
    # Build the model and start training
    with tf.Session() as sess:
        # Overall flow: 1. build the model  2. fetch data and convert it into the model's input format
        #               3. feed the model and compute the loss  4. update the parameters
        # Build the model
        model = create_model(sess, False)
        # Initialize variables
        sess.run(tf.global_variables_initializer())
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]  #i=0,1,2,3==>bucket_sizes[: 1],
        # Start training
        metrics = '  '.join(
            ['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}'])
        bars_max = 20
        with tf.device('/gpu:0'):
            for epoch_index in range(1, FLAGS.num_epoch + 1600):
                print('Epoch {}:'.format(epoch_index))
                time_start = time.time()
                epoch_trained = 0
                batch_loss = []
                while True:
                    # Pick a bucket to train on
                    random_number = np.random.random_sample()
                    #tmp=[]
                    #for i in range(len(buckets_scale)):
                    #if buckets_scale[i] > random_number:
                    #tmp.append(i)
                    #bucket_id = min(tmp)
                    bucket_id = (1 if random_number <= 0.25
                                 else 2 if random_number <= 0.5
                                 else 3 if random_number < 0.75
                                 else 4)
                    bucket_id -= 1
                    # Pick the question/answer length bucket first, since both padding and decoding depend on it
                    #bucket_id = min([i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number])
                    # Fetch 64 Q&A pairs; data and data_in hold question->answer and answer->question pairs
                    data, data_in = model.get_batch_data(
                        bucket_dbs, bucket_id)  # first get the Q&A pairs and their reversed counterparts
                    encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                        bucket_dbs, bucket_id, data
                    )  # then build the padded encoder_inputs, decoder_inputs and the freshly generated decoder_weights
                    # encoder_inputs and decoder_inputs only hold character IDs; decoder_weights is made of 1s and 0s,
                    # one per character position
                    # As the source code shows, the IDs are only the raw input; embedding_attention_seq2seq
                    # creates a randomly initialised embedding internally
                    _, step_loss, output = model.step(
                        sess, encoder_inputs, decoder_inputs, decoder_weights,
                        bucket_id, False
                    )  # feed in the encoder, decoder and weight inputs plus the chosen bucket_id,
                    # and get the outputs according to training/test mode
                    epoch_trained += FLAGS.batch_size
                    batch_loss.append(step_loss)  # collected to compute the average loss
                    time_now = time.time()
                    time_spend = time_now - time_start
                    time_estimate = time_spend / (epoch_trained /
                                                  FLAGS.num_per_epoch)
                    percent = min(100,
                                  epoch_trained / FLAGS.num_per_epoch) * 100
                    bars = math.floor(percent / 100 * bars_max)
                    sys.stdout.write(
                        metrics.format('=' * bars + '-' * (bars_max - bars),
                                       percent,
                                       epoch_trained, FLAGS.num_per_epoch,
                                       np.mean(batch_loss),
                                       data_utils.time(time_spend),
                                       data_utils.time(time_estimate)))
                    sys.stdout.flush()
                    if epoch_trained >= FLAGS.num_per_epoch:
                        break
                print('\n')

        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        if epoch_index % 800 == 0:
            model.saver.save(sess,
                             os.path.join(FLAGS.model_dir, FLAGS.model_name))
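
The bucket formats described in Example #8 (5_15, 10_20, 15_25, 20_30) exist so that padding stays local to each bucket. A rough, self-contained sketch of what padding and the 0/1 decoder_weights look like for a (5, 15) bucket; the token IDs and the helper are hypothetical, not the repository's actual data_utils or get_batch:

# Hypothetical special token IDs (the real values live in data_utils).
PAD_ID, GO_ID, EOS_ID = 0, 1, 2

def pad_pair(ask_ids, answer_ids, encoder_size=5, decoder_size=15):
    # Encoder side: pad the question up to the bucket's question length.
    encoder_input = ask_ids + [PAD_ID] * (encoder_size - len(ask_ids))
    # Decoder side: GO + answer + EOS, padded up to the bucket's answer length.
    decoder_input = [GO_ID] + answer_ids + [EOS_ID]
    decoder_input += [PAD_ID] * (decoder_size - len(decoder_input))
    # Weights are 1 over real positions and 0 over padding, so padded positions
    # contribute nothing to the loss.
    weights = [1.0 if tok != PAD_ID else 0.0 for tok in decoder_input]
    return encoder_input, decoder_input, weights

# A 3-character question and a 4-character answer (IDs are made up).
print(pad_pair([11, 12, 13], [21, 22, 23, 24]))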
Example #9
def test_bleu(count):
    """测试bleu"""
    from nltk.translate.bleu_score import sentence_bleu
    from tqdm import tqdm
    # Prepare data
    print('Preparing data')
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    # If count is 0, sample over all examples by default
    if count <= 0:
        count = total_size
    buckets_scale = [
        sum(bucket_sizes[:i + 1]) / total_size
        for i in range(len(bucket_sizes))
    ]
    with tf.Session() as sess:
        # Build the model
        model = create_model(sess, True)
        model.batch_size = 1
        # Initialize variables
        sess.run(tf.initialize_all_variables())
        model.saver.restore(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))

        total_score = 0.0
        for i in tqdm(range(count)):
            # Pick a bucket to sample from
            random_number = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(buckets_scale))
                if buckets_scale[i] > random_number
            ])
            data, _ = model.get_batch_data(
                bucket_dbs,
                bucket_id
            )
            encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                bucket_dbs,
                bucket_id,
                data
            )
            _, _, output_logits = model.step(
                sess,
                encoder_inputs,
                decoder_inputs,
                decoder_weights,
                bucket_id,
                True
            )
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            ask, _ = data[0]
            all_answers = bucket_dbs[bucket_id].all_answers(ask)
            ret = data_utils.indice_sentence(outputs)
            if not ret:
                continue
            references = [list(x) for x in all_answers]
            score = sentence_bleu(
                references,
                list(ret),
                weights=(1.0,)
            )
            total_score += score
        print('BLEU: {:.2f} in {} samples'.format(total_score / count * 10, count))
Example #10
def train():
    """训练模型"""
    # 准备数据
    print('准备数据')
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    # Build the model and start training
    with tf.Session() as sess:
        # Build the model
        model = create_model(sess, False)
        # Initialize variables
        sess.run(tf.initialize_all_variables())
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]
        # Start training
        metrics = '  '.join([
            '\r[{}]',
            '{:.1f}%',
            '{}/{}',
            'loss={:.3f}',
            '{}/{}'
        ])
        bars_max = 20
        for epoch_index in range(1, FLAGS.num_epoch + 1):
            print('Epoch {}:'.format(epoch_index))
            time_start = time.time()
            epoch_trained = 0
            batch_loss = []
            while True:
                # Pick a bucket to train on
                random_number = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(buckets_scale))
                    if buckets_scale[i] > random_number
                ])
                data, data_in = model.get_batch_data(
                    bucket_dbs,
                    bucket_id
                )
                encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                    bucket_dbs,
                    bucket_id,
                    data
                )
                _, step_loss, output = model.step(
                    sess,
                    encoder_inputs,
                    decoder_inputs,
                    decoder_weights,
                    bucket_id,
                    False
                )
                epoch_trained += FLAGS.batch_size
                batch_loss.append(step_loss)
                time_now = time.time()
                time_spend = time_now - time_start
                time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch)
                percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100
                bars = math.floor(percent / 100 * bars_max)
                sys.stdout.write(metrics.format(
                    '=' * bars + '-' * (bars_max - bars),
                    percent,
                    epoch_trained, FLAGS.num_per_epoch,
                    np.mean(batch_loss),
                    data_utils.time(time_spend), data_utils.time(time_estimate)
                ))
                sys.stdout.flush()
                if epoch_trained >= FLAGS.num_per_epoch:
                    break
            print('\n')

        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
Example #11
def train():
    """訓練模型"""
    # 准备数据
    print('準備數據')
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    # Start building the model
    with tf.Session() as sess:
        # Build the model
        model = create_model(sess, False)
        # Initialize variables
        sess.run(tf.initialize_all_variables())
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]
        # Start training
        metrics = '  '.join(
            ['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}'])
        bars_max = 20
        for epoch_index in range(1, FLAGS.num_epoch + 1):
            print('Epoch {}:'.format(epoch_index))
            time_start = time.time()
            epoch_trained = 0
            batch_loss = []
            while True:
                # Pick a bucket to train on
                random_number = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(buckets_scale))
                    if buckets_scale[i] > random_number
                ])
                data, data_in = model.get_batch_data(bucket_dbs, bucket_id)
                encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                    bucket_dbs, bucket_id, data)
                _, step_loss, output = model.step(sess, encoder_inputs,
                                                  decoder_inputs,
                                                  decoder_weights, bucket_id,
                                                  False)
                epoch_trained += FLAGS.batch_size
                batch_loss.append(step_loss)
                time_now = time.time()
                time_spend = time_now - time_start
                time_estimate = time_spend / (epoch_trained /
                                              FLAGS.num_per_epoch)
                percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100
                bars = math.floor(percent / 100 * bars_max)
                sys.stdout.write(
                    metrics.format('=' * bars + '-' * (bars_max - bars),
                                   percent, epoch_trained, FLAGS.num_per_epoch,
                                   np.mean(batch_loss),
                                   data_utils.time(time_spend),
                                   data_utils.time(time_estimate)))
                sys.stdout.flush()
                if epoch_trained >= FLAGS.num_per_epoch:
                    break
            print('\n')

        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
Example #12
def train():
    """训练模型"""
    print('数据准备中...')
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    bucket_sizes = []
    for i in range(len(buckets)):
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} records'.format(i, bucket_size))
    total_size = sum(bucket_sizes)
    print('{} records in total'.format(total_size))
    
    with tf.Session() as sess:
        model = create_model(sess, False)
        sess.run(tf.global_variables_initializer())
        # Compute each bucket's share of the data
        buckets_scale = [sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes))]
        # Console output formatting
        metrics = '  '.join([
            '\r[{}]',
            '{:.1f}%',
            '{}/{}',
            'loss={:.3f}',
            '{}/{}'
        ])
        bars_max = 20
        with tf.device('/gpu:0'):
            for epoch_index in range(1, FLAGS.num_epoch + 1600):
                print('Epoch {}:'.format(epoch_index))
                time_start = time.time()
                epoch_trained = 0  # samples already trained in this epoch
                batch_loss = []
                while True:
                    # Randomly pick a bucket_id to train on
                    random_number = np.random.random_sample()
                    bucket_id = min([i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number])
                    # Fetch 64 Q&A pairs; data and data_in hold question->answer and answer->question pairs
                    data, data_in = model.get_batch_data(
                        bucket_dbs,
                        bucket_id
                    )
                    # Convert the Q&A pairs into the format the model expects;
                    # for the bucket_10_20 bucket the shapes are 10*64, 20*64, 20*64
                    encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                        bucket_dbs,
                        bucket_id,
                        data
                    )
                    # Train
                    _, step_loss, output = model.step(
                        sess,
                        encoder_inputs,
                        decoder_inputs,
                        decoder_weights,
                        bucket_id,
                        False
                    )
                    epoch_trained += FLAGS.batch_size
                    batch_loss.append(step_loss)
                    time_now = time.time()
                    time_spend = time_now - time_start
                    time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch)
                    percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100
                    bars = math.floor(percent / 100 * bars_max)
                    sys.stdout.write(metrics.format(
                        '=' * bars + '-' * (bars_max - bars),
                        percent,
                        epoch_trained, FLAGS.num_per_epoch,
                        np.mean(batch_loss),
                        data_utils.time(time_spend), data_utils.time(time_estimate)
                    ))
                    sys.stdout.flush()
                    if epoch_trained >= FLAGS.num_per_epoch:
                        break
                print('\n')

        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        if epoch_index % 800 == 0:
            model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
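
The shape comment in Example #12 (10*64, 20*64 and 20*64 for the bucket_10_20 bucket) reflects the time-major layout the TF1 bucketed seq2seq expects: one list entry per time step, each holding a whole batch. A tiny NumPy illustration of that layout, assuming batch_size 64; the arrays are toy stand-ins, not the repository's get_batch output:

import numpy as np

batch_size = 64
encoder_size, decoder_size = 10, 20  # the (10, 20) bucket

# Time-major layout: a list with one array per time step, each of length batch_size.
encoder_inputs = [np.zeros(batch_size, dtype=np.int32) for _ in range(encoder_size)]
decoder_inputs = [np.zeros(batch_size, dtype=np.int32) for _ in range(decoder_size)]
decoder_weights = [np.ones(batch_size, dtype=np.float32) for _ in range(decoder_size)]

print(len(encoder_inputs), encoder_inputs[0].shape)   # 10 (64,)
print(len(decoder_inputs), decoder_weights[0].shape)  # 20 (64,)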