Example #1
def main(global_step=global_step):
    train_dataset = dataset.get_train_dataset(src_file=config.train_src_file, tgt_file=config.train_tgt_file,
                                              tgt_vocab_table=tgt_vocab_table, batch_size=config.batch_size)
    init_acc = 0
    if config.eval_only:
        logger.info("======== Evaluation only ===============")
        test_acc = infer()
        logger.info("Test acc {:.4f}".format(test_acc))
    else:
        for epoch in range(config.max_epochs):
            total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
            for batch_data in train_dataset.take(config.steps_per_epoch):
                start_time = time.time()
                src_inputs, tgt_input_ids, tgt_output_ids, src_len, tgt_len = batch_data
                batch_size = src_inputs.shape[0]
                batch_loss = train_step(batch_data)
                total_loss += batch_loss * batch_size
                total_cnt += batch_size
                step_time += time.time() - start_time
                if (global_step + 1) % 100 == 0:
                    train_loss = total_loss / total_cnt
                    speed = total_cnt / step_time
                    logger.info("epoch {} global_step {} example-time {:.2f} total loss: {:.4f}".
                                format(epoch, global_step + 1, speed, train_loss))
                    total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
                global_step += 1
            test_acc = infer()
            checkpoint.save(file_prefix=chkpoint_prefix + "_acc_{:.4f}".format(test_acc) + "-" + str(global_step))
            logger.info("Saving model to {}".format(
                chkpoint_prefix + "_acc_{:.4f}".format(test_acc) + "-" + str(global_step)))
            if test_acc > init_acc:
                checkpoint.save(
                    file_prefix=best_output + "_acc_{:.4f}".format(test_acc) + "-" + str(global_step))
                init_acc = test_acc
                logger.info("Currently the best acc {:.4f}".format(test_acc))
Example #2
def main(global_step=global_step):
    train_dataset = dataset.get_train_dataset(src_file=config.train_src_file,
                                              tgt_file=config.train_tgt_file,
                                              tgt_vocab_table=tgt_vocab_table,
                                              batch_size=config.batch_size)
    init_bleu = 0
    if config.eval_only:
        logger.info("======== Evaluation only ===============")
        eval_bleu, eval_loss = eval()
        test_bleu = infer()
        logger.info("Eval loss {:.4f}, bleu {:.4f}".format(
            eval_loss, eval_bleu))
        logger.info("Test bleu {:.4f}".format(test_bleu))
    else:
        for epoch in range(global_epoch + 1, config.max_epochs):
            total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
            for batch_data in train_dataset.take(config.steps_per_epoch):
                start_time = time.time()
                src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data
                batch_size = src_inputs.shape[0]
                batch_loss = train_step(batch_data)
                total_loss += batch_loss * batch_size
                total_cnt += batch_size
                step_time += time.time() - start_time
                if (global_step + 1) % 100 == 0:
                    train_loss = total_loss / total_cnt
                    train_ppl = misc_utils.safe_exp(total_loss / total_cnt)
                    speed = total_cnt / step_time
                    # current_lr = learning_rate(global_step)
                    logger.info(
                        "epoch {} global_step {} example-time {:.2f} total loss: {:.4f} ppl {:.4f}"
                        .format(epoch, global_step + 1, speed, train_loss,
                                train_ppl))
                    if math.isnan(train_ppl):
                        break
                    total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
                global_step = tf.add(global_step, 1)
            eval_bleu, eval_loss = eval()
            test_bleu = infer()
            logger.info(
                "Epoch {}, Internal eval bleu {:.4f} loss {:.4f}, External test bleu {:.4f}"
                .format(epoch, eval_bleu, eval_loss, test_bleu))
            checkpoint.save(file_prefix=chkpoint_prefix +
                            "_bleu_{:.4f}".format(test_bleu) + "-" +
                            str(global_step))
            logger.info(
                "Saving model to {}".format(chkpoint_prefix +
                                            "_bleu_{:.4f}".format(test_bleu) +
                                            "-" + str(global_step)))
            if test_bleu > init_bleu:
                checkpoint.save(file_prefix=best_output +
                                "_bleu_{:.4f}".format(test_bleu) + "-" +
                                str(global_step))
                init_bleu = test_bleu
                logger.info("Currently the best bleu {:.4f}".format(test_bleu))
Example #3
def main(global_step=global_step):
    train_dataset = dataset.get_train_dataset(src_file=config.train_src_file,
                                              tgt_file=config.train_tgt_file,
                                              tgt_vocab_table=tgt_vocab_table,
                                              batch_size=config.batch_size)
    init_wer = float("inf")  # WER is an error rate: lower is better
    if config.eval_only:
        logger.info("======== Evaluation only ===============")
        test_wer = infer()
        logger.info("Test wer {:.4f}".format(test_wer))
    else:
        for epoch in range(config.max_epochs):
            total_ctc_loss, total_reg_loss, total_cnt, step_time = 0.0, 0.0, 0.0, 0.0
            for batch_data in train_dataset.take(config.steps_per_epoch):
                start_time = time.time()
                src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data
                batch_size = src_inputs.shape[0]
                ctc_loss, reg_loss = train_step(batch_data, epoch,
                                                global_step == 0)
                total_ctc_loss += ctc_loss * batch_size
                total_reg_loss += reg_loss * batch_size
                total_cnt += batch_size
                step_time += time.time() - start_time
                if (global_step + 1) % 100 == 0:
                    train_ctc_loss = total_ctc_loss / total_cnt
                    train_reg_loss = total_reg_loss / total_cnt
                    speed = total_cnt / step_time
                    logger.info(
                        "epoch {} global_step {} example-time {:.2f} ctc loss: {:.4f} reg loss: {:.4f}"
                        .format(epoch, global_step + 1, speed, train_ctc_loss,
                                train_reg_loss))
                    total_ctc_loss, total_reg_loss, total_cnt, step_time = 0.0, 0.0, 0.0, 0.0
                global_step = tf.add(global_step, 1)

            test_wer = infer()
            save_file_prefix = chkpoint_prefix + "_wer_{:.4f}".format(
                test_wer) + "-" + str(global_step.numpy())
            checkpoint.save(save_file_prefix)
            logger.info("Saving model to {}".format(save_file_prefix))
            if test_wer < init_wer:
                checkpoint.save(file_prefix=best_output_predfix +
                                "_wer_{:.4f}".format(test_wer) + "-" +
                                str(global_step.numpy()))
                init_wer = test_wer
                logger.info("Currently the best wer {:.4f}".format(test_wer))
Example #4
def eval():
    """internal evaluation """
    dev_dataset = dataset.get_train_dataset(src_file=config.eval_src_file,
                                            tgt_file=config.eval_tgt_file,
                                            tgt_vocab_table=tgt_vocab_table,
                                            batch_size=config.batch_size)
    total_cnt, total_loss, total_bleu = 0.0, 0.0, 0.0
    for batch_num, batch_data in enumerate(dev_dataset.take(config.debug_num)):
        src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data
        logits = model(batch_data, training=True)
        bs = logits.shape[0]
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits,
            tgt_output_ids,
            config.label_smoothing,
            vocab_size=tgt_vocab_size)
        batch_loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
        batch_bleu = metrics.bleu_score(logits=logits, labels=tgt_output_ids)
        total_cnt += bs
        total_loss += bs * batch_loss
        total_bleu += bs * batch_bleu
    eval_loss = total_loss / total_cnt
    eval_bleu = total_bleu / total_cnt
    return eval_bleu, eval_loss
Example #5
def main_worker(gpus, args):
    # Define the model, loss function, and optimizer
    model = resnet18()
    torch.cuda.set_device('cuda:{}'.format(gpus[0]))
    model.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)

    # apex initialization
    model, optimizer = amp.initialize(model, optimizer)

    # If more than one GPU is used, wrap the model with nn.DataParallel
    if len(gpus) > 1:
        model = nn.DataParallel(model, device_ids=gpus, output_device=gpus[0])

    # Define Training Schedule and Dataloader
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[60, 120, 160],
                                                     gamma=0.2)
    train_dataset = get_train_dataset()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=4,
                                               pin_memory=True)
    test_dataset = get_test_dataset()
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              num_workers=4,
                                              pin_memory=True)

    # Training
    for epoch in range(args.epochs):
        start = time.time()
        model.train()

        # Step train_scheduler to adjust the learning rate
        train_scheduler.step(epoch)

        for step, (images, labels) in enumerate(train_loader):
            # Move the batch to the GPU
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # Update the model weights
            optimizer.zero_grad()
            # loss.backward()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            print(
                'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'
                .format(loss,
                        optimizer.param_groups[0]['lr'],
                        epoch=epoch + 1,
                        trained_samples=step * args.batch_size + len(images),
                        total_samples=len(train_loader.dataset)))

        finish = time.time()
        print('epoch {} training time consumed: {:.2f}s'.format(
            epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion)
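Each epoch ends with a validate(...) call that is not shown (Examples #7 and #8 pass local_rank and args as extra arguments). A minimal sketch of such a validation pass; the metric reporting is an assumption:

def validate(test_loader, model, criterion):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            outputs = model(images)
            total_loss += criterion(outputs, labels).item() * labels.size(0)
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
    print('Validation loss: {:.4f}, acc: {:.4f}'.format(total_loss / total, correct / total))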
Example #6
def scaled_dot_product_attention(q, k, v, mask=None):
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., q_len, kv_len)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        # Mask out padded positions with a large negative value before the softmax.
        scaled_attention_logits += (mask * -1e9)
    # softmax is normalized on the last axis (seq_len_k) so that the scores add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits,
                                      axis=-1)  # (..., q_len, kv_len)
    output = tf.matmul(attention_weights, v)  # (..., q_len, d_model)
    return output, attention_weights, scaled_attention_logits


# print(os.getcwd())
base_path = "/home/panxie/Documents/sign-language/nslt/Data"
src_file = base_path + "/phoenix2014T.dev.sign"
tgt_file = base_path + "/phoenix2014T.dev.de"
tgt_vocab_table = create_tgt_vocab_table(base_path + "/phoenix2014T.vocab.de")
dataset = dataset.get_train_dataset(src_file, tgt_file, tgt_vocab_table)
cnt = 0
for data in dataset.take(1):
    cnt += 1
    src_inputs, tgt_in, tgt_out, src_path, src_len, tgt_len = data
    bs, t, h, w, c = src_inputs.shape
    print(src_inputs.shape, src_path)
    src_inputs = tf.reshape(src_inputs, (bs * t, h, w, c))
    cnn_output = resnet_model(src_inputs, training=False)
    cnn_output = tf.reshape(cnn_output, (bs, t, -1))
    attention_out, atten_weights, atten_logits = scaled_dot_product_attention(
        cnn_output, cnn_output, cnn_output, mask=None)
    for i in range(100):
        # print(atten_logits[0, i, :])
        print(tf.nn.top_k(atten_logits[0, i, :], k=10).indices)
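create_tgt_vocab_table is used above but not defined in this snippet. A minimal sketch with tf.lookup, assuming a newline-delimited vocab file whose line number serves as the token id:

def create_tgt_vocab_table(vocab_file, default_id=0):
    initializer = tf.lookup.TextFileInitializer(
        vocab_file,
        key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
    return tf.lookup.StaticHashTable(initializer, default_value=default_id)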
Example #7
def main_worker(local_rank, nprocs, args):
    args.local_rank = local_rank
    init_seeds(local_rank+1)
    # Build the init_method address (IP and port) for communication
    init_method = 'tcp://' + args.ip + ':' + args.port

    # 1. Distributed initialization: every process needs it, so it lives inside main_worker
    cudnn.benchmark = True
    dist.init_process_group(backend='nccl', init_method=init_method, world_size=args.nprocs,
                            rank=local_rank)

    # 2. Basic definitions: model, loss function, optimizer
    model = resnet18()
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    criterion = nn.CrossEntropyLoss().cuda(local_rank)
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=1e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[60, 120, 160], gamma=0.2)

    # apex initialization
    model = apex.parallel.convert_syncbn_model(model).to(local_rank)  # use apex's SyncBatchNorm
    model, optimizer = amp.initialize(model, optimizer)
    model = DDP(model)

    # 3. Load the data
    batch_size = int(args.batch_size / nprocs)

    train_dataset = get_train_dataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=4, pin_memory=True, sampler=train_sampler)

    test_dataset = get_test_dataset()
    test_sampler = torch.utils.data.distributed.DistributedSampler(test_dataset)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=4, pin_memory=True, sampler=test_sampler)

    for epoch in range(args.epochs):
        start = time.time()
        model.train()
        train_sampler.set_epoch(epoch)
        train_scheduler.step(epoch)

        for step, (images, labels) in enumerate(train_loader):
            # Move this process's batch to its GPU
            images = images.cuda(local_rank, non_blocking=True)
            labels = labels.cuda(local_rank, non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, args.nprocs)

            # Update the model weights; wrap the loss with scale_loss
            optimizer.zero_grad()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            if args.local_rank == 0:
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
                        reduced_loss,
                        optimizer.param_groups[0]['lr'],
                        epoch=epoch+1,
                        trained_samples=step * args.batch_size + len(images),
                        total_samples=len(train_loader.dataset)
                    ))

        finish = time.time()
        if args.local_rank == 0:
            print('epoch {} training time consumed: {:.2f}s'.format(epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion, local_rank, args)
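Both distributed examples average the loss across processes with a reduce_mean helper that is not shown. A minimal sketch based on torch.distributed.all_reduce (assumed implementation):

import torch.distributed as dist

def reduce_mean(tensor, nprocs):
    """Sum a tensor over all processes, then divide by the world size."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= nprocs
    return rt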
Example #8
def main_worker(local_rank, nprocs, args):
    args.local_rank = local_rank
    init_seeds(local_rank + 1)  # set different seed for each worker
    # Build the init_method address (IP and port) for communication
    init_method = 'tcp://' + args.ip + ':' + args.port

    # 1. Distributed initialization: every process needs it, so it lives inside main_worker
    cudnn.benchmark = True
    dist.init_process_group(backend='nccl',
                            init_method=init_method,
                            world_size=args.nprocs,
                            rank=local_rank)

    # 2. Basic definitions: model, loss function, optimizer
    model = resnet18()  # define the model; pin each process to its GPU with .set_device(local_rank) / .cuda(local_rank)

    # The parts below need local_rank: the model
    # ================================
    torch.cuda.set_device(local_rank)  # use set_device and .cuda to select this process's GPU
    model.cuda(local_rank)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(local_rank)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank])  # wrap the model with DistributedDataParallel
    # =================================
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[60, 120, 160],
                                                     gamma=0.2)

    # 3. Load the data
    batch_size = int(args.batch_size / nprocs)  # manually split the global batch_size into per-process mini-batches

    train_dataset = get_train_dataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=4,
                                               pin_memory=True,
                                               sampler=train_sampler)

    test_dataset = get_test_dataset()
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              num_workers=4,
                                              pin_memory=True,
                                              sampler=test_sampler)

    for epoch in range(args.epochs):
        start = time.time()
        model.train()
        # Set the sampler's epoch each epoch so the DataLoader reshuffles correctly
        train_sampler.set_epoch(epoch)

        # Step train_scheduler to adjust the learning rate
        train_scheduler.step(epoch)

        for step, (images, labels) in enumerate(train_loader):
            # Move this process's batch to the GPU
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # torch.distributed.barrier() blocks each process until all processes reach this point, so the averaged loss/accuracy below is not skewed by processes running at different speeds
            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, args.nprocs)

            # Update the model weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if args.local_rank == 0:
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'
                    .format(reduced_loss,
                            optimizer.param_groups[0]['lr'],
                            epoch=epoch + 1,
                            trained_samples=step * args.batch_size +
                            len(images),
                            total_samples=len(train_loader.dataset)))

        finish = time.time()
        if args.local_rank == 0:
            print('epoch {} training time consumed: {:.2f}s'.format(
                epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion, local_rank, args)
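main_worker is designed to be launched once per GPU. A minimal launcher sketch with torch.multiprocessing.spawn; the command-line flags shown (--ip, --port, --lr, --batch_size, --epochs) are assumptions matching how args is used above:

import argparse
import torch
import torch.multiprocessing as mp

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ip', default='127.0.0.1')
    parser.add_argument('--port', default='23456')
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--epochs', type=int, default=200)
    args = parser.parse_args()
    args.nprocs = torch.cuda.device_count()
    # Spawn one process per GPU; each receives its local_rank as the first argument.
    mp.spawn(main_worker, nprocs=args.nprocs, args=(args.nprocs, args))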