def train():
    torch.cuda.set_device(0)
    iteration = 0
    model = WaveRNN(HPARAMS)
    model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=HPARAMS.lr)

    if ARGS.checkpoint:
        if os.path.basename(ARGS.checkpoint).startswith('ema_model'):
            ema_checkpoint = ARGS.checkpoint
        else:
            ema_checkpoint = 'ema_model_' + os.path.basename(ARGS.checkpoint)
            ema_checkpoint = os.path.join(os.path.dirname(ARGS.checkpoint), ema_checkpoint)

        # Initialise EMA from the ema checkpoint.
        logging.info('Initialising ema model {}'.format(ema_checkpoint))
        ema_model = WaveRNN(HPARAMS).cuda()
        ema_base_model, _ = load_checkpoint(ema_checkpoint, ema_model)
        ema = init_ema(ema_base_model, HPARAMS.ema_rate)

        # Initialise vanilla model
        logging.info('Loading checkpoint {}'.format(ARGS.checkpoint))
        model, iteration, optimizer = load_checkpoint(ARGS.checkpoint, model, optimizer)

    else:
        # Initialise EMA from scratch.
        ema = init_ema(model, HPARAMS.ema_rate)

    criterion = nn.NLLLoss(reduction='sum').cuda()
    train_loader, test_loader = get_loader(ARGS.data, 'train', HPARAMS), get_loader(ARGS.data, 'valid', HPARAMS)
    whole_loader = get_loader(ARGS.data, 'valid', HPARAMS, whole=True)
    model = nn.DataParallel(model)

    epoch_offset = max(0, int(iteration / len(train_loader)))
    for _ in range(epoch_offset, ARGS.epochs):
        iteration = train_step(
            train_loader, test_loader, whole_loader, model, optimizer,
            criterion, iteration, ema=ema
        )

        averaged_model = clone_as_averaged_model(model, ema)
        save_checkpoint(
            {
                'state_dict': model.module.state_dict(),
                'iteration': iteration,
                'dataset': ARGS.data,
                'optimizer': optimizer.state_dict(),
            }, iteration,
            'checkpoints/{}/lastmodel.pth'.format(ARGS.expName), ARGS.expName,
        )
        save_checkpoint(
            {
                'state_dict': averaged_model.state_dict(),
                'iteration': iteration,
                'dataset': ARGS.data,
                'optimizer': optimizer.state_dict(),
            }, iteration,
            'checkpoints/{}/ema_model_lastmodel.pth'.format(ARGS.expName), ARGS.expName,
        )
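The loop above depends on init_ema and clone_as_averaged_model, which are defined elsewhere in the project. A minimal sketch of how such EMA helpers are commonly implemented (the class and method names below are assumptions, not the project's actual API):

class ExponentialMovingAverage:
    """Keep a decayed running average (shadow copy) of a model's parameters."""

    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {name: param.detach().clone()
                       for name, param in model.named_parameters() if param.requires_grad}

    def update(self, model):
        # shadow = decay * shadow + (1 - decay) * param; call once per optimizer step.
        for name, param in model.named_parameters():
            if name in self.shadow:
                self.shadow[name].mul_(self.decay).add_(param.detach(), alpha=1.0 - self.decay)

    def copy_to(self, model):
        # Overwrite a model's parameters with the averaged values.
        for name, param in model.named_parameters():
            if name in self.shadow:
                param.data.copy_(self.shadow[name])

With helpers like these, clone_as_averaged_model amounts to deep-copying the trained model and calling copy_to on the copy.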
Example #2
def test_inference_forward_parity():
    hparams = create_hparams()
    model = WaveRNN(hparams, debug=True).cuda()
    model.train()
    data_path = '../data/short_sens/'
    whole_segments = get_loader(data_path, 'valid', hparams, whole=True)
    for i, (x, m, _) in enumerate(whole_segments):
        x, m = x.cuda(), m.cuda()
        forward_output, f_context, f_x = model.train_mode_generate(x, m)
        inference_output, i_cont_dict, i_x = model.inference(m, gt=x)
        # The two code paths should produce numerically identical outputs.
        assert abs(i_x - f_x).mean() < 1e-6
Example #3
def visualize_attn(args):
    """ 
    Visualization of learned attention map.
    """
    config = CONFIGS[args.model_type]
    num_classes = 10 if args.dataset == "cifar10" else 100

    model = VisionTransformer(config, args.img_size, 
                            norm_type=args.norm_type, 
                            zero_head=True, 
                            num_classes=num_classes,
                            vis=True)

    ckpt_file = os.path.join(args.output_dir, args.name + "_checkpoint.bin")
    ckpt = torch.load(ckpt_file)
    # use a single GPU to visualize the attention map
    model.load_state_dict(ckpt)
    model.to(args.device)
    model.eval()
    
    _, test_loader = get_loader(args)
    sample_idx = 0
    layer_ids = [0, 3, 6, 9]
    head_id = 0
    with torch.no_grad():
        for step, batch in enumerate(test_loader):
            batch = tuple(t.to(args.device) for t in batch)
            x, y = batch
            select_x = x[sample_idx].unsqueeze(0)
            output, attn_weights = model(select_x)
            # attn_weights is a list of tensors with shape (1, num_heads, len_h, len_h)
            for layer_id in layer_ids:
                vis_attn(args, attn_weights[layer_id].squeeze(0)[head_id], layer_id=layer_id)
            break # visualize the first sample in the first batch
    print("done.")
    exit(0)
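vis_attn is imported from elsewhere in the project. A minimal sketch of such a helper, saving one head's attention matrix as a heatmap (the signature and the output path derived from args.output_dir are assumptions):

import os

import matplotlib
matplotlib.use("Agg")  # render to files without a display
import matplotlib.pyplot as plt


def vis_attn(args, attn_map, layer_id=0):
    """Save a single head's (len_h, len_h) attention matrix as a heatmap."""
    attn_map = attn_map.detach().cpu().numpy()
    fig, ax = plt.subplots(figsize=(6, 6))
    im = ax.imshow(attn_map, cmap="viridis")
    fig.colorbar(im, ax=ax)
    ax.set_title("layer {} attention".format(layer_id))
    out_path = os.path.join(args.output_dir, "attn_layer{}.png".format(layer_id))
    fig.savefig(out_path, bbox_inches="tight")
    plt.close(fig)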
Example #4
def train(args, model, device, acc_calculator):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Prepare dataset
    train_loader, test_loader = get_loader(args)

    # Prepare optimizer and scheduler
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps
    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    model.zero_grad()
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    losses = AverageMeter()
    global_step, best_acc = 0, 0
    loss_fct = BatchAllLoss(margin=0.1)
    while True:
        model.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch[:2])
            x, y = batch
            embeds = model(x)
            loss = loss_fct(embeds, y)
            losses.update(loss.item())

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

            epoch_iterator.set_description(
                "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val)
            )
            if args.local_rank in [-1, 0]:
                writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step)
                writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step)
            if global_step % args.eval_every == 0 and args.local_rank in [-1, 0]:
                accuracy = valid(args, model, writer, test_loader, global_step, device, acc_calculator)
                if best_acc < accuracy:
                    save_model(args, model)
                    best_acc = accuracy
                model.train()

            if global_step % t_total == 0:
                break
        losses.reset()
        if global_step % t_total == 0:
            break

    if args.local_rank in [-1, 0]:
        writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
Example #5
def train(args, model):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Prepare dataset
    train_loader, test_loader = get_loader(args)

    # Prepare optimizer and scheduler
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps
    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.fp16:
        model, optimizer = amp.initialize(models=model,
                                          optimizers=optimizer,
                                          opt_level=args.fp16_opt_level)
        amp._amp_state.loss_scalers[0]._loss_scale = 2**20

    # Distributed training
    if args.local_rank != -1:
        model = DDP(model,
                    message_size=250000000,
                    gradient_predivide_factor=get_world_size())

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)

    model.zero_grad()
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    losses = AverageMeter()
    global_step, best_acc = 0, 0
    while True:
        model.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            x, y = batch
            loss = model(x, y)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                losses.update(loss.item() * args.gradient_accumulation_steps)
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # step the scheduler after the optimizer (PyTorch >= 1.1 ordering)
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" %
                    (global_step, t_total, losses.val))
                if args.local_rank in [-1, 0]:
                    writer.add_scalar("train/loss",
                                      scalar_value=losses.val,
                                      global_step=global_step)
                    writer.add_scalar("train/lr",
                                      scalar_value=scheduler.get_lr()[0],
                                      global_step=global_step)
                if global_step % args.eval_every == 0 and args.local_rank in [
                        -1, 0
                ]:
                    accuracy = valid(args, model, writer, test_loader,
                                     global_step)
                    if best_acc < accuracy:
                        save_model(args, model)
                        best_acc = accuracy
                    model.train()

                if global_step % t_total == 0:
                    break
        losses.reset()
        if global_step % t_total == 0:
            break

    if args.local_rank in [-1, 0]:
        writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
Example #6
def train(args, model):
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))
        # writer = SummaryWriter(log_dir=os.path.join("logs", 'transformer'))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    train_loader, test_loader = get_loader(args)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps

    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    model.zero_grad()
    set_seed(args)
    losses = AverageMeter()
    global_step, best_acc, best_losses = 0, 0, np.inf
    while True:
        model.train()
        # criterion = nn.CrossEntropyLoss()
        criterion = FocalLoss()
        # criterion = MyCrossEntropyLoss()
        # criterion = MyMseLoss(args)
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            imgs, intensity, labels = batch
            outputs = model(imgs, intensity,
                            labels)  # output[0].size = (B, N, C)
            outputs = outputs.view(-1, args.class_number)
            # outputs = F.softmax(outputs, dim=-1)
            labels = labels.view(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                losses.update(loss.item() * args.gradient_accumulation_steps)

                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # step the scheduler after the optimizer (PyTorch >= 1.1 ordering)
                optimizer.zero_grad()
                global_step += 1

            epoch_iterator.set_description(
                "Training (%d / %d Steps) (loss=%2.5f)" %
                (global_step, t_total, loss.item()))
            if args.local_rank in [-1, 0]:
                writer.add_scalar("train/loss",
                                  scalar_value=losses.val,
                                  global_step=global_step)
                writer.add_scalar("train/lr",
                                  scalar_value=scheduler.get_lr()[0],
                                  global_step=global_step)
            if global_step % args.eval_every == 0:
                accuracy, eval_losses = valid(args, model, writer, test_loader,
                                              global_step)
                if eval_losses <= best_losses:
                    save_model(args, model)
                    best_losses = eval_losses
                model.train()

            if global_step % t_total == 0:
                break
        losses.reset()
        if global_step % t_total == 0:
            break
    if args.local_rank in [-1, 0]:
        writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
Example #7
def launch_routine(model):
    data_path = ARGS.data
    if 'valid/mel' in ARGS.data:
        data_path = data_path.replace('valid/mel', '')
    whole_segments = get_loader(data_path, 'valid', HPARAMS, whole=True)
    generate_routine(model, whole_segments, ARGS.out_dir, HPARAMS)
Example #8
def test(args, model):  # NOTE: the original snippet begins mid-function; this signature is an assumption
    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" %
    #                (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
    # Set seed
    set_seed(args)
    mse0, mse1, mse2, mse3 = 0, 0, 0, 0
    mae0, mae1, mae2, mae3 = 0, 0, 0, 0
    pre = []
    lab = []
    acc = 0.0
    l2 = [10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 25, 27, 28, 30, 32, 33, 35, 38, 40, 42, 43, 45, 48, 50,
          51, 52, 53, 55, 57, 58, 60, 62, 65, 68, 70, 72]
    l3 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
          29, 30, 31, 32, 33, 34, 35, 36, 37]
    all_preds, all_label = [], []
    train_loader, test_loader = get_loader(args)
    epoch_iterator = tqdm(test_loader,
                          desc="Validating... (loss=X.X)",
                          bar_format="{l_bar}{r_bar}",
                          dynamic_ncols=True,
                          disable=args.local_rank not in [-1, 0])
    # criterion = nn.CrossEntropyLoss(ignore_index=0)
    # f = open('./pred_result.txt', 'a')
    test_num = 0
    model.eval()
    eval_losses = AverageMeter()
    for step, batch in enumerate(epoch_iterator):
        batch = tuple(t.to(args.device) for t in batch)
        imgs, intensity, labels = batch
        bs, src_len = labels.size()
        test_target = torch.ones((bs, src_len - 1), dtype=labels.dtype).to(labels.device)
Example #9
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--name", required=True,
                        help="Name of this run. Used for monitoring.")
    parser.add_argument("--dataset", choices=["cifar10", "cifar100"], default="cifar10",
                        help="Which downstream task.")
    parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16",
                                                 "ViT-L_32", "ViT-H_14", "R50-ViT-B_16"],
                        default="ViT-B_16",
                        help="Which variant to use.")

    parser.add_argument("--test_mode", action="store_true")
    parser.add_argument("--mixup", action="store_true")
    parser.add_argument("--mixup_layer", type=int, default=0)
    parser.add_argument("--mixup_alpha", type=float, default=1.0)

    parser.add_argument("--pretrained_dir", type=str, default="checkpoint/ViT-B_16.npz",
                        help="Where to search for pretrained ViT models.")
    parser.add_argument("--output_dir", default="output", type=str,
                        help="The output directory where checkpoints will be written.")

    parser.add_argument("--img_size", default=224, type=int,
                        help="Resolution size")
    parser.add_argument("--train_batch_size", default=512, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--eval_every", default=100, type=int,
                        help="Run prediction on validation set every so many steps."
                             "Will always run one evaluation at the end of training.")

    parser.add_argument("--learning_rate", default=3e-2, type=float,
                        help="The initial learning rate for SGD.")
    parser.add_argument("--weight_decay", default=0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--num_steps", default=10000, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine",
                        help="How to decay the learning rate.")
    parser.add_argument("--warmup_steps", default=500, type=int,
                        help="Step of training to perform learning rate warmup for.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")

    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O2',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             timeout=timedelta(minutes=60))
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
                        
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" %
                   (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    # Set seed
    set_seed(args)

    # Model & Tokenizer Setup
    args, model = setup(args)

    if not args.test_mode:
        # Training
        train(args, model)
    else:
        _, test_loader = get_loader(args)
        global_step = -1
        accuracy = valid(args, model, None, test_loader, global_step)
        print("Test accuracy: ", accuracy)