Example #1
def testcase_StepLR_fused(B=3, step_size=2, gamma=0.1, last_epoch=-1):
  lr = random.choice([torch.rand((B,)), random.random()])
  net_array = [_TestNet() for _ in range(B)]
  net_fused = _TestNet(B=B)
  optimizer_array = [
      optim.Adadelta(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
      ) for b in range(B)
  ]
  optimizer_fused = get_hfta_optim_for(optim.Adadelta, B=B)(
      net_fused.parameters(),
      lr=lr,
  )
  # Sync per-instance initial_lr only when last_epoch is overridden from -1
  # (i.e., the schedulers are constructed as if resuming training).
  if not isinstance(last_epoch, int) or last_epoch != -1:
    _init_initial_lr(optimizer_fused, optimizer_array)
  lr_scheduler_array = [
      lr_scheduler.StepLR(
          optimizer_array[b],
          index_array_or_return_scalar(step_size, b),
          gamma=index_array_or_return_scalar(gamma, b),
          last_epoch=index_array_or_return_scalar(last_epoch, b),
      ) for b in range(B)
  ]
  lr_scheduler_fused = get_hfta_lr_scheduler_for(lr_scheduler.StepLR, B=B)(
      optimizer_fused,
      step_size,
      gamma=gamma,
      last_epoch=last_epoch,
  )
  _lr_scheduler_testing_procedure(net_fused, net_array, optimizer_fused,
                                  optimizer_array, lr_scheduler_fused,
                                  lr_scheduler_array)
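These test cases lean on a helper, `index_array_or_return_scalar`, whose definition is not shown in the excerpts. A minimal sketch consistent with how it is called here (the actual hfta implementation may differ) is:

import torch

# Sketch only: picks the b-th entry when given per-instance hyperparameters
# (list/tuple/tensor), and passes scalars through unchanged.
def index_array_or_return_scalar(array_or_scalar, b):
  if isinstance(array_or_scalar, (list, tuple)):
    return array_or_scalar[b]
  if torch.is_tensor(array_or_scalar):
    return array_or_scalar[b].item()
  return array_or_scalar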
Example #2
def testcase_fused(
        B=3,
        lr=1.0,
        rho=0.9,
        eps=1e-6,
        weight_decay=0,
        device=torch.device('cpu'),
        dtype=torch.float,
):
    if B > 1 and isinstance(lr, (int, float)):
        lr = [random.uniform(0.5, 2.0) for _ in range(B)]

    kwargs = {'device': device, 'dtype': dtype}
    net_array = [_TestNet(**kwargs) for _ in range(B)]
    net_fused = _TestNet(B=B, **kwargs)
    optimizer_array = [
        optim.Adadelta(
            net_array[b].parameters(),
            lr=index_array_or_return_scalar(lr, b),
            rho=index_array_or_return_scalar(rho, b),
            eps=index_array_or_return_scalar(eps, b),
            weight_decay=index_array_or_return_scalar(weight_decay, b),
        ) for b in range(B)
    ]
    optimizer_fused = get_hfta_optim_for(optim.Adadelta, B=B)(
        net_fused.parameters(),
        lr=lr,
        rho=rho,
        eps=eps,
        weight_decay=weight_decay,
    )
    _optim_testing_procedure(net_fused, net_array, optimizer_fused,
                             optimizer_array)
Example #3
def testcase_partially_fused(
        B=3,
        amsgrad=False,
        device=torch.device('cpu'),
        dtype=torch.float,
):
    kwargs = {'device': device, 'dtype': dtype}
    net_array = [_TestNet(**kwargs) for _ in range(B)]
    net_fused = _TestNet(B=B, partially_fused=True, **kwargs)
    lr = [random.uniform(1e-4, 1e-2) for _ in range(B)]
    betas = (
        [random.uniform(0.8, 0.99) for _ in range(B)],
        [random.uniform(0.998, 0.9999) for _ in range(B)],
    )
    eps = [random.uniform(1e-9, 1e-7) for _ in range(B)]
    weight_decay = [random.uniform(0.0, 0.3) for _ in range(B)]
    optimizer_array = [
        optim.Adam(
            net_array[b].parameters(),
            lr=index_array_or_return_scalar(lr, b),
            betas=(
                index_array_or_return_scalar(betas[0], b),
                index_array_or_return_scalar(betas[1], b),
            ),
            eps=index_array_or_return_scalar(eps, b),
            weight_decay=index_array_or_return_scalar(weight_decay, b),
            amsgrad=amsgrad,
        ) for b in range(B)
    ]
    partially_fused_optimizer = get_hfta_optim_for(
        optim.Adam,
        B=B,
        partially_fused=True,
    )(
        net_fused.parameters(),
        net_fused.unfused_parameters(),
        lr=lr,
        betas=betas,
        eps=eps,
        weight_decay=weight_decay,
        amsgrad=amsgrad,
        B=B,
    )
    _optim_testing_procedure(net_fused, net_array, partially_fused_optimizer,
                             optimizer_array)
Example #4
def testcase_StepLR_partially_fused(B=3):
  net_array = [_TestNet() for _ in range(B)]
  net_fused = _TestNet(B=B, partially_fused=True)
  lr = [random.uniform(0.5, 2.0) for _ in range(B)]
  step_size = [random.randint(2, 8) for _ in range(B)]
  gamma = [random.uniform(0.1, 0.3) for _ in range(B)]
  last_epoch = [random.randint(5, 11) for _ in range(B)]
  optimizer_array = [
      optim.Adadelta(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
      ) for b in range(B)
  ]
  optimizer_partially_fused = get_hfta_optim_for(
      optim.Adadelta,
      B=B,
      partially_fused=True,
  )(
      net_fused.parameters(),
      net_fused.unfused_parameters(),
      lr=lr,
  )
  _init_initial_lr(optimizer_partially_fused, optimizer_array)
  lr_scheduler_array = [
      lr_scheduler.StepLR(
          optimizer_array[b],
          index_array_or_return_scalar(step_size, b),
          gamma=index_array_or_return_scalar(gamma, b),
          last_epoch=index_array_or_return_scalar(last_epoch, b),
      ) for b in range(B)
  ]
  lr_scheduler_partially_fused = get_hfta_lr_scheduler_for(
      lr_scheduler.StepLR,
      B=B,
      partially_fused=True,
  )(
      optimizer_partially_fused,
      step_size,
      gamma=gamma,
      last_epoch=last_epoch,
  )
  _lr_scheduler_testing_procedure(net_fused, net_array,
                                  optimizer_partially_fused, optimizer_array,
                                  lr_scheduler_partially_fused,
                                  lr_scheduler_array)
Example #5
def testcase_fused(
        B=3,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
        device=torch.device('cpu'),
        dtype=torch.float,
):
    if B > 1 and isinstance(lr, (int, float)):
        lr = [random.uniform(1e-4, 1e-2) for _ in range(B)]

    kwargs = {'device': device, 'dtype': dtype}
    net_array = [_TestNet(**kwargs) for _ in range(B)]
    net_fused = _TestNet(B=B, **kwargs)
    optimizer_array = [
        optim.Adam(
            net_array[b].parameters(),
            lr=index_array_or_return_scalar(lr, b),
            betas=(
                index_array_or_return_scalar(betas[0], b),
                index_array_or_return_scalar(betas[1], b),
            ),
            eps=index_array_or_return_scalar(eps, b),
            weight_decay=index_array_or_return_scalar(weight_decay, b),
            amsgrad=amsgrad,
        ) for b in range(B)
    ]
    optimizer_fused = get_hfta_optim_for(optim.Adam, B=B)(
        net_fused.parameters(),
        lr=lr,
        betas=betas,
        eps=eps,
        weight_decay=weight_decay,
        amsgrad=amsgrad,
    )
    _optim_testing_procedure(net_fused, net_array, optimizer_fused,
                             optimizer_array)
Example #6
def testcase_partially_fused(
        B=3,
        device=torch.device('cpu'),
        dtype=torch.float,
):
    kwargs = {'device': device, 'dtype': dtype}
    net_array = [_TestNet(**kwargs) for _ in range(B)]
    net_fused = _TestNet(B=B, partially_fused=True, **kwargs)
    lr = [random.uniform(0.5, 2.0) for _ in range(B)]
    rho = [random.uniform(0.7, 0.99) for _ in range(B)]
    eps = [random.uniform(1e-7, 1e-5) for _ in range(B)]
    weight_decay = [random.uniform(0.0, 0.3) for _ in range(B)]
    optimizer_array = [
        optim.Adadelta(
            net_array[b].parameters(),
            lr=index_array_or_return_scalar(lr, b),
            rho=index_array_or_return_scalar(rho, b),
            eps=index_array_or_return_scalar(eps, b),
            weight_decay=index_array_or_return_scalar(weight_decay, b),
        ) for b in range(B)
    ]
    partially_fused_optimizer = get_hfta_optim_for(
        optim.Adadelta,
        B=B,
        partially_fused=True,
    )(
        net_fused.parameters(),
        net_fused.unfused_parameters(),
        lr=lr,
        rho=rho,
        eps=eps,
        weight_decay=weight_decay,
        B=B,
    )
    _optim_testing_procedure(net_fused, net_array, partially_fused_optimizer,
                             optimizer_array)
Example #7
File: main.py Project: UofT-EcoSystem/hfta
def main(args):
  print(args)
  random.seed(args.seed)
  np.random.seed(args.seed)
  torch.manual_seed(args.seed)
  track_running_stats = (args.device != 'xla')
  if args.device == 'cuda':
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    print('Enable cuDNN heuristics!')

  device = (torch.device(args.device)
            if args.device in {'cpu', 'cuda'} else xm.xla_device())
  if args.device == 'cuda' and args.amp:
    scaler = amp.GradScaler()
  else:
    scaler = None

  train_loader, test_loader = init_dataloader(args)

  B = len(args.lr) if args.hfta else 0

  model = Resnet18(num_classes=10, B=B,
                   track_running_stats=track_running_stats).to(device)
  if not args.convergence_test:
    if B == 0 and args.save_init_model:
      torch.save(model, args.model_dir)
      print("model saved! exiting...")
      exit(0)
    if args.load_init_model:
      model.init_load([args.model_dir] * max(1, B))
  print('B={} lr={}'.format(B, args.lr))

  optimizer = get_hfta_optim_for(optim.Adadelta, B=B)(
      model.parameters(),
      lr=args.lr if B > 0 else args.lr[0],
  )

  all_losses = []
  epoch_timer = EpochTimer()
  for epoch in range(args.epochs):
    epoch_timer.epoch_start(epoch)
    num_samples_per_epoch, epoch_losses = train(args,
                                                model,
                                                device,
                                                train_loader,
                                                optimizer,
                                                epoch,
                                                B,
                                                save_loss=args.convergence_test,
                                                scaler=scaler)
    epoch_timer.epoch_stop(num_samples_per_epoch)
    if args.convergence_test:
      all_losses.append(epoch_losses)
    print('Epoch {} took {} s!'.format(epoch, epoch_timer.epoch_latency(epoch)))

  if args.convergence_test:
    all_losses = torch.cat(all_losses, 0).transpose(0, 1).cpu().numpy()
    print(all_losses.shape)
    loss_dict = {}
    for i, lr in enumerate(args.lr):
      loss_dict[lr] = all_losses[i]
    data = pd.DataFrame(loss_dict)
    data.to_csv(os.path.join(args.outf, "convergence.csv"))
  else:
    if args.device == 'xla':
      print(met.metrics_report())
    if args.outf is not None:
      epoch_timer.to_csv(args.outf)

  if args.eval:
    test(model, device, test_loader, B)
  print('All jobs Finished!')
Example #8
File: main.py Project: UofT-EcoSystem/hfta
model = model.TransformerModel(ntokens,
                               args.emsize,
                               args.nhead,
                               args.nhid,
                               args.nlayers,
                               args.dropout,
                               B=B).to(device)

if args.device == 'cuda' and args.amp:
    scaler = amp.GradScaler()
else:
    scaler = None

# Loop over epochs.
optimizer = get_hfta_optim_for(optim.Adadelta, B=B)(
    model.parameters(),
    lr=args.lr if B > 0 else args.lr[0],
)

scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
    optimizer,
    step_size=args.step_size if B > 0 else args.step_size[0],
    gamma=args.gamma if B > 0 else args.gamma[0],
)

print("NVIDIA_TF32_OVERRIDE: {}".format(
    os.environ.get('NVIDIA_TF32_OVERRIDE')))

epoch_timer = EpochTimer()
print("start training!")
for epoch in range(1, args.epochs + 1):
    epoch_timer.epoch_start(epoch)
Example #9
netD.apply(weights_init)
if args.netD != '':
    netD.load_state_dict(torch.load(args.netD))
print(netD)

criterion = nn.BCEWithLogitsLoss()

if B > 0:
    fixed_noise = torch.randn(args.batchSize, B, nz, 1, 1, device=device)
else:
    fixed_noise = torch.randn(args.batchSize, nz, 1, 1, device=device)
real_label = 1
fake_label = 0

# setup optimizer
Adam = get_hfta_optim_for(optim.Adam, B=B)
optimizerD = Adam(netD.parameters(), lr=args.lr, betas=(args.beta1, 0.999))
optimizerG = Adam(netG.parameters(), lr=args.lr, betas=(args.beta1, 0.999))

if args.device == 'cuda' and args.amp:
    scaler = amp.GradScaler()
else:
    scaler = None  # avoid a NameError downstream when AMP is disabled

if args.dry_run:
    args.epochs = 1


def loss_fn(output, label, batch_size):
    if B > 0:
        return B * criterion(output.view(batch_size * B), label)
    else:
        return criterion(output, label)
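The `B *` factor in `loss_fn` compensates for the mean reduction over the fused dimension: `criterion` averages over all `batch_size * B` fused elements, which divides by an extra factor of B relative to a single instance, so scaling by B restores a loss (and gradients) equal to the sum of B independent per-instance mean losses. A quick self-contained check (shapes here are illustrative, not from the source):

import torch
import torch.nn as nn

B, N = 4, 8
criterion = nn.BCEWithLogitsLoss()  # default 'mean' reduction
output = torch.randn(B, N)
label = torch.rand(B, N)

# Fused loss rescaled by B vs. the sum of B separate per-instance losses.
fused = B * criterion(output.view(B * N), label.view(B * N))
per_instance = sum(criterion(output[b], label[b]) for b in range(B))
assert torch.allclose(fused, per_instance, atol=1e-6)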
Example #10
blue = lambda x: '\033[94m' + x + '\033[0m'

classifier = PointNetDenseCls(
    k=num_classes,
    feature_transform=opt.feature_transform,
    B=B,
    track_running_stats=(opt.device != 'xla'),
)

if opt.model != '':
    classifier.load_state_dict(torch.load(opt.model))

optimizer = get_hfta_optim_for(optim.Adam, B=B)(
    classifier.parameters(),
    lr=opt.lr,
    betas=(opt.beta1, opt.beta2),
    weight_decay=opt.weight_decay,
)
scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
    optimizer,
    step_size=opt.step_size,
    gamma=opt.gamma,
)

scaler = amp.GradScaler(enabled=(opt.device == 'cuda' and opt.amp))

classifier.to(device)

num_batch = len(dataloader)

Example #11
def main(args):
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    track_running_stats = (args.device != 'xla')
    if args.device == 'cuda':
        assert torch.cuda.is_available()
        torch.backends.cudnn.benchmark = True
        print('Enable cuDNN heuristics!')

    device = (torch.device(args.device)
              if args.device in {'cpu', 'cuda'} else xm.xla_device())
    if args.device == 'cuda' and args.amp:
        scaler = amp.GradScaler()
    else:
        scaler = None

    train_loader, test_loader = init_dataloader(args)

    B = len(args.lr) if args.hfta else 0
    model_config = generate_partially_fused_config(args.serial_num)
    print("Model config:", model_config)

    normal_block = str_to_class(model_config["normal_block"])
    serial_block = str_to_class(model_config["serial_block"])
    model = PartiallyFusedResNet(
        model_config["arch"],
        normal_block,
        serial_block,
        num_classes=10,
        B=B,
        track_running_stats=track_running_stats,
    ).to(device)

    if len(model.unfused_layers) > 0:
        model.unfused_to(device)
        optimizer = get_hfta_optim_for(optim.Adadelta,
                                       B=B,
                                       partially_fused=True)(
                                           model.parameters(),
                                           model.get_unfused_parameters(),
                                           lr=args.lr if B > 0 else args.lr[0],
                                       )
    else:
        optimizer = get_hfta_optim_for(optim.Adadelta, B=B)(
            model.parameters(),
            lr=args.lr if B > 0 else args.lr[0],
        )

    epoch_timer = EpochTimer()
    for epoch in range(args.epochs):
        epoch_timer.epoch_start(epoch)
        num_samples_per_epoch, _ = train(args,
                                         model,
                                         device,
                                         train_loader,
                                         optimizer,
                                         epoch,
                                         B,
                                         scaler=scaler)
        epoch_timer.epoch_stop(num_samples_per_epoch)
        print('Epoch {} took {} s!'.format(epoch,
                                           epoch_timer.epoch_latency(epoch)))

    if args.device == 'xla':
        print(met.metrics_report())
    if args.outf is not None:
        epoch_timer.to_csv(args.outf)

    if args.eval:
        test(model, device, test_loader, B)
    print('All jobs Finished!')
Example #12
File: main.py Project: UofT-EcoSystem/hfta
def main(args):
  _seeding(args)

  _mkdir_outf(args)

  device = _create_device_handle(args)

  scaler = _create_scaler(args)

  train_loader, test_loader, num_classes = _create_dataloaders(args)

  if args.hfta:
    B = consolidate_hyperparams_and_determine_B(
        args,
        ['lr', 'beta1', 'beta2', 'weight_decay', 'gamma', 'step_size'],
    )
  else:
    B = 0
    (args.lr, args.beta1, args.beta2, args.weight_decay, args.gamma,
     args.step_size) = (args.lr[0], args.beta1[0], args.beta2[0],
                        args.weight_decay[0], args.gamma[0], args.step_size[0])

  model = _get_model_constructor(args)(
      num_classes=num_classes,
      B=B,
      track_running_stats=(args.device != 'xla'),
  ).to(device)
  criterion = nn.CrossEntropyLoss()
  optimizer = get_hfta_optim_for(optim.Adam, B=B)(
      model.parameters(),
      lr=args.lr,
      betas=(args.beta1, args.beta2),
      weight_decay=args.weight_decay,
  )
  scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
      optimizer,
      step_size=args.step_size,
      gamma=args.gamma,
  )

  epoch_timer = EpochTimer()

  for epoch in range(args.epochs):
    epoch_timer.epoch_start(epoch)
    num_samples_done = train(args, model, criterion, optimizer, scaler, device,
                             train_loader, epoch, B)
    scheduler.step()

    epoch_timer.epoch_stop(num_samples_done)
    print('Epoch {} took {} s!'.format(epoch, epoch_timer.epoch_latency(epoch)))

  if args.device == 'xla':
    print(met.metrics_report())

  if args.outf is not None:
    epoch_timer.to_csv(args.outf)

  if args.eval:
    acc_top1, acc_top5 = test(args, model, device, test_loader, B)
    if args.outf is not None:
      pd.DataFrame({
          'acc:top1': acc_top1,
          'acc:top5': acc_top5,
      }).to_csv(os.path.join(args.outf, 'eval.csv'))
    return acc_top1, acc_top5
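Examples #12 and #13 both call `consolidate_hyperparams_and_determine_B`, which is defined elsewhere in the hfta examples. Judging from these call sites, it normalizes the named hyperparameters into length-B lists and returns B; a hypothetical sketch (name reuse aside, the broadcasting behavior is assumed, not taken from the source):

def consolidate_hyperparams_and_determine_B(args, names):
  # B is the number of hyperparameter configurations to fuse together.
  B = max(len(getattr(args, name)) for name in names)
  for name in names:
    values = getattr(args, name)
    if len(values) == 1:
      values = values * B  # broadcast a single value to all B instances
    assert len(values) == B, 'hyperparameter lists must agree on B'
    setattr(args, name, values)
  return B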
Example #13
def main(args):
    blue = lambda x: '\033[94m' + x + '\033[0m'

    seeding(args.seed)

    if args.hfta:
        B = consolidate_hyperparams_and_determine_B(
            args,
            ['lr', 'beta1', 'beta2', 'weight_decay', 'gamma', 'step_size'],
        )
    else:
        B = 0
        (args.lr, args.beta1, args.beta2, args.weight_decay, args.gamma,
         args.step_size) = (args.lr[0], args.beta1[0], args.beta2[0],
                            args.weight_decay[0], args.gamma[0],
                            args.step_size[0])

    if args.device == 'cuda':
        assert torch.cuda.is_available()
        torch.backends.cudnn.benchmark = True
        print('Enable cuDNN heuristics!')
    device = (xm.xla_device()
              if args.device == 'xla' else torch.device(args.device))

    dataset, test_dataset = build_dataset(args)
    dataloader, testdataloader = build_dataloader(args, dataset, test_dataset)

    print('len(dataset)={}'.format(len(dataset)),
          'len(test_dataset)={}'.format(len(test_dataset)))
    num_classes = len(dataset.classes)
    print('classes', num_classes)

    if args.outf is not None:
        try:
            os.makedirs(args.outf)
        except OSError:
            pass

    classifier = PointNetCls(
        k=num_classes,
        feature_transform=args.feature_transform,
        B=B,
        track_running_stats=(args.device != 'xla'),
    )

    if args.model != '':
        classifier.load_state_dict(torch.load(args.model))

    optimizer = get_hfta_optim_for(optim.Adam, B=B)(
        classifier.parameters(),
        lr=args.lr,
        betas=(args.beta1, args.beta2),
        weight_decay=args.weight_decay,
    )
    scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
        optimizer,
        step_size=args.step_size,
        gamma=args.gamma,
    )

    scaler = amp.GradScaler(enabled=(args.device == 'cuda' and args.amp))

    classifier.to(device)

    num_batch = len(dataloader)

    def loss_fn(output, label, batch_size, trans_feat):
        if B > 0:
            # Scale by B to undo the mean reduction over the fused dimension
            # (same rescaling as the BCEWithLogitsLoss loss_fn in Example #9).
            loss = B * F.nll_loss(output.view(B * batch_size, -1), label)
        else:
            loss = F.nll_loss(output, label)
        if args.feature_transform:
            loss += feature_transform_regularizer(trans_feat) * 0.001
        return loss

    classifier = classifier.train()
    epoch_timer = EpochTimer()

    # Training loop
    for epoch in range(args.epochs):
        num_samples_per_epoch = 0
        epoch_timer.epoch_start(epoch)
        for i, data in enumerate(dataloader, 0):
            if i > args.iters_per_epoch:
                break
            if args.warmup_data_loading:
                continue

            points, target = data
            target = target[:, 0]
            points, target = points.to(device), target.to(device)
            N = points.size(0)
            if B > 0:
                points = points.unsqueeze(0).expand(B, -1, -1, -1).contiguous()
                target = target.repeat(B)
            optimizer.zero_grad(set_to_none=True)
            if args.device == 'cuda':
                with amp.autocast(enabled=args.amp):
                    pred, trans, trans_feat = classifier(points)
                    loss = loss_fn(pred, target, N, trans_feat)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
            else:
                pred, trans, trans_feat = classifier(points)
                loss = loss_fn(pred, target, N, trans_feat)
                loss.backward()
                if args.device == 'xla':
                    xm.optimizer_step(optimizer, barrier=True)
                else:
                    optimizer.step()

            print('[{}: {}/{}] train loss: {}'.format(epoch, i, num_batch,
                                                      loss.item()))
            num_samples_per_epoch += N * max(B, 1)
            scaler.update()
        scheduler.step()
        epoch_timer.epoch_stop(num_samples_per_epoch)
        print('Epoch {} took {} s!'.format(epoch,
                                           epoch_timer.epoch_latency(epoch)))

    if args.device == 'xla' and not args.eval:
        print(met.metrics_report())
    if args.outf is not None:
        epoch_timer.to_csv(args.outf)

    if args.eval:
        # Run validation loop.
        print("Running validation loop ...")
        classifier = classifier.eval()
        with torch.no_grad():
            total_correct = torch.zeros(max(B, 1), device=device)
            total_testset = 0
            for data in testdataloader:
                if args.warmup_data_loading:
                    continue
                points, target = data
                target = target[:, 0]
                points, target = points.to(device), target.to(device)
                N = points.size(0)
                if B > 0:
                    points = points.unsqueeze(0).expand(B, -1, -1,
                                                        -1).contiguous()
                    target = target.repeat(B)
                pred, _, _ = classifier(points)
                pred_choice = pred.argmax(-1)

                correct = pred_choice.eq(
                    target.view(B, N) if B > 0 else target).sum(-1)

                total_correct.add_(correct)
                total_testset += N

            final_accuracy = total_correct / total_testset
            final_accuracy = final_accuracy.cpu().tolist()
            if args.outf is not None:
                pd.DataFrame({
                    'acc': final_accuracy
                }).to_csv(os.path.join(args.outf, 'eval.csv'))

            # Return test_accuracy
            return final_accuracy
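Taken together, the examples share one pattern: wrap a stock torch.optim class with `get_hfta_optim_for` (and its scheduler with `get_hfta_lr_scheduler_for`), then pass length-B lists wherever a scalar hyperparameter would normally go. A condensed sketch of that pattern (import path as used in the hfta repository; `_TestNet` is the fused test module from the excerpts above):

import torch.optim as optim
from hfta.optim import get_hfta_optim_for, get_hfta_lr_scheduler_for

B = 3                    # number of training instances fused into one model
lr = [1e-3, 3e-3, 1e-2]  # one hyperparameter value per instance
step_size = [2, 4, 8]
gamma = [0.1, 0.2, 0.3]

model = _TestNet(B=B)    # any HFTA-fused module works here

optimizer = get_hfta_optim_for(optim.Adam, B=B)(
    model.parameters(),
    lr=lr,
)
scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
    optimizer,
    step_size=step_size,
    gamma=gamma,
)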