def Objective(trial):
    """Optuna objective: train a ViT on MNIST with trial-sampled hyperparameters.

    Samples model width/depth/heads/MLP size, optimizer class and learning
    rate, trains for the module-level ``epochs`` epochs, reports validation
    accuracy to the trial after each epoch (enabling pruning), logs it to
    wandb, and returns the final-epoch validation accuracy.

    Relies on module-level globals: ``device``, ``epochs``, ``gamma``,
    ``train_loader``, ``test_loader``, ``train``, ``test``, ``wandb``.

    Raises:
        optuna.exceptions.TrialPruned: if the pruner decides to stop the trial.
    """
    dim = trial.suggest_categorical('dim', [32, 64, 128])
    # 28x28 MNIST images must be divisible by the patch size, so it is fixed.
    patch_size = 7
    depth = trial.suggest_categorical('depth', [8, 16, 32])
    heads = trial.suggest_categorical('heads', [8, 16, 32])
    mlp_dim = trial.suggest_categorical('mlp_dim', [128, 512, 1024])
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    print('dim:', dim, 'mlp_dim:', mlp_dim, 'depth:', depth, 'heads:', heads)

    model = ViT(
        dim=dim,
        image_size=28,
        patch_size=patch_size,
        num_classes=10,
        depth=depth,    # number of transformer blocks
        heads=heads,    # number of attention heads
        mlp_dim=mlp_dim,
        channels=1,     # MNIST is grayscale
    )
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    # Resolve the optimizer class by name from torch.optim.
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

    val_acc = 0.0  # robust default if epochs == 0
    for epoch in range(1, epochs + 1):
        train(model, criterion, device, train_loader, optimizer, epoch)
        val_acc = test(model, device, test_loader)
        scheduler.step()
        # Report the intermediate value so the pruner can act on it.
        trial.report(val_acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        wandb.log({'val_acc': val_acc})
    return val_acc
update_freq_schedule=args.kfac_update_freq_schedule) else: preconditioner = None print(f"======== optimizer={optimizer}\n\n======== MODEL={model.name_()}\n======== preconditioner={preconditioner}") # KFAC guarentees grads are equal across ranks before opt.step() is called # so if we do not use kfac we need to wrap the optimizer with horovodcon if isHVD: compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Average, backward_passes_per_step=args.batches_per_allreduce) hvd.broadcast_optimizer_state(optimizer, root_rank=0) hvd.broadcast_parameters(model.state_dict(), root_rank=0) if len(lr_scheduler)==0: #...5...[100, 150]... lrs = create_lr_schedule(num_replicas, args.warmup_epochs, args.lr_decay) lr_scheduler = [LambdaLR(optimizer, lrs)] if use_kfac: lr_scheduler.append(LambdaLR(preconditioner, lrs)) for ls in lr_scheduler: print(f"======== lr_scheduler={ls.state_dict()}") start = time.time() for epoch in range(config.epochs): train(epoch) test(epoch)
def main():
    """CLI entry point: train a (spherical) ViT classifier.

    Parses arguments, builds the dataset/model pair, runs the
    train/validate loop, checkpoints the best-accuracy, best-loss and
    last models under ``weights/``, and writes learning-curve plots.

    Relies on names imported elsewhere in this file: ``ViT``,
    ``ViT_sphere``, ``SMNIST``, ``DVSC``, ``DataLoader``, ``tqdm``,
    ``np``, ``torch``, ``optim``, ``plt``.
    """
    parser = argparse.ArgumentParser(description='ViT')
    parser.add_argument('--data_dir', default='data/sph_dogs_vs_cats')
    parser.add_argument('--dataset', default='dvsc')
    parser.add_argument('--exp_id', default='sdvsc-adam')
    parser.add_argument('--mode', default='normal')
    # BUGFIX: without type=int, values given on the command line arrived as
    # strings and broke DataLoader(batch_size=...) and range(epochs).
    parser.add_argument('--batch', type=int, default=128)
    parser.add_argument('--epochs', type=int, default=10)
    # NOTE(review): any non-empty CLI string (even "False") is truthy here;
    # kept as-is to preserve the existing interface.
    parser.add_argument('--cuda', default=True)
    parser.add_argument('--optim', default='SGD')
    args = parser.parse_args()

    # Portable replacement for os.system('mkdir -p weights').
    os.makedirs('weights', exist_ok=True)

    dataset = {'smnist': SMNIST, 'dvsc': DVSC}
    if args.dataset == 'smnist':
        image_size = 60
        patch_size = 10
        num_classes = 10
        samp = 6
    elif args.dataset == 'dvsc':
        image_size = 384
        patch_size = 32
        num_classes = 2
        samp = 12

    if args.mode == 'normal':
        model = ViT(image_size=image_size, patch_size=patch_size,
                    num_classes=num_classes, dim=512, depth=4, heads=8,
                    mlp_dim=512, dropout=0.1, emb_dropout=0.1)
    else:
        model = ViT_sphere(image_size=image_size, patch_size=patch_size,
                           num_classes=num_classes, dim=512, depth=4, heads=8,
                           mlp_dim=512, base_order=1,
                           mode=args.mode,  # face, vertex or regular
                           samp=samp, dropout=0.1, emb_dropout=0.1)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum(np.prod(p.size()) for p in model_parameters)
    print("Trainable parameters", params)

    cuda = args.cuda
    epochs = args.epochs
    batch = args.batch
    path = 'weights/'

    train_data = dataset[args.dataset](args.data_dir, 'train', image_size, image_size, None)
    valid_data = dataset[args.dataset](args.data_dir, 'valid', image_size, image_size, None)
    train_loader = DataLoader(dataset=train_data, batch_size=batch, shuffle=True)
    valid_loader = DataLoader(dataset=valid_data, batch_size=batch, shuffle=True)

    if cuda:
        model = model.cuda()

    if args.optim == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
    else:
        optimizer = optim.Adam(model.parameters(), lr=1e-3)
    cla_loss = torch.nn.CrossEntropyLoss()

    valid_loss = 1000  # best (lowest) validation loss seen so far
    valid_acc = 0      # best (highest) validation accuracy (%) seen so far
    print("Training Start")
    T_L = []  # per-epoch mean training loss
    V_L = []  # per-epoch mean validation loss
    V_a = []  # per-epoch validation accuracy (%)
    for epoch in range(epochs):  # renamed from `i`: it was shadowed by batch loops
        print("Epoch", epoch + 1)

        # ---- training ----
        model.train()
        train_losses = []
        for data in tqdm(train_loader):
            img, target = data
            if cuda:
                img = img.cuda()
                target = target.cuda()
            preds = model(img)
            output = cla_loss(preds, target)
            train_losses.append(output.cpu().item())
            output.backward()
            optimizer.step()
            optimizer.zero_grad()
        T_L.append(np.mean(train_losses))
        print("train loss:", np.mean(train_losses))

        # ---- validation ----
        sum_acc = 0
        total = len(valid_data)
        model.eval()
        # BUGFIX: validation losses were appended to the training-loss list,
        # so the reported "val loss" was a train+val average.
        val_losses = []
        with torch.no_grad():  # no gradients needed during evaluation
            for data in tqdm(valid_loader):
                img, target = data
                if cuda:
                    img = img.cuda()
                    target = target.cuda()
                preds = model(img)
                val_losses.append(cla_loss(preds, target).item())
                probabilities = torch.nn.functional.softmax(preds, dim=1)
                pred_labels = torch.argmax(probabilities, dim=1)
                sum_acc += (pred_labels == target).sum()
        v_l = np.mean(val_losses)
        v_a = sum_acc.item() / total * 100

        # BUGFIX: checkpoints previously recorded the constant `epochs`
        # instead of the epoch at which they were taken.
        if v_a > valid_acc:
            valid_acc = v_a
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }, path + args.exp_id + 'model_acc.pth')
        if v_l < valid_loss:
            valid_loss = v_l
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }, path + args.exp_id + 'model_loss.pth')
        V_L.append(v_l)
        V_a.append(v_a)
        print("val loss:", v_l)
        print("val acc:", v_a)

    print(T_L)
    plt.plot(T_L, label='Total_loss', color='blue')
    plt.plot(V_L, label='Valid_loss', color='red')
    plt.legend(loc="upper left")
    plt.xlabel("num of epochs")
    plt.ylabel("loss")
    plt.savefig(path + args.exp_id + 'Learning_Curves.png')
    plt.clf()
    plt.plot(V_a, label='Valid_acc', color='cyan')
    plt.legend(loc="upper left")
    plt.xlabel("num of epochs")
    plt.ylabel("accuracy")
    plt.savefig(path + args.exp_id + 'Val_acc.png')

    torch.save(
        {
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, path + args.exp_id + 'model_last.pth')
test_preds = v(test_img).view(-1, 32, 2) for k in range(test_preds.shape[0]): test_pred, test_pred_label = confident_strategy(test_preds[k]) all_test_preds.append(test_pred_label) all_labels.append(int(test_label[k].cpu())) total_test_loss += bce(test_pred, test_label[k]).detach() #total_test_loss += bce(sigmoid(test_pred), test_label[k]).detach() TP_test, FN_test, FP_test, TN_test = confusion_matrix_c(test_pred_label, int(test_label[k].cpu()), TP_test, FN_test, FP_test, TN_test) ''' test_loss = criterion(test_preds, test_label) test_output = torch.argmax(test_preds, dim=1) test_correct = (test_output == test_label).float().sum() ''' if best_log_loss > (total_test_loss/len(test_dataloader)): torch.save(v.state_dict(), 'best_model.pt') best_log_loss = (total_test_loss/len(test_dataloader)) test_accuracy = (TP_test + TN_test) / (TP_test + FN_test + FP_test + TN_test + 2e-5) test_precision = TP_test / (TP_test + FP_test + 2e-5) test_recall = TP_test / (TP_test + FN_test + 2e-5) test_f1_score = 2 * ((test_precision * test_recall) / (test_precision + test_recall + 2e-5)) # writer print("{} Test Log Loss: {:.3f}, Accuracy: {:.3f}, Precision: {:.3f}, Recall: {:.3f}".format(i+1, (total_test_loss/len(test_dataloader)), test_accuracy, test_precision, test_recall)) writer.add_scalar('test_epoch_log_loss', (total_test_loss/len(test_dataloader)), i + 1) writer.add_scalar('test_epoch_accuracy', test_accuracy, i + 1) writer.add_scalar('test_epoch_precision', test_precision, i + 1)
val_loss += criterion(val_preds, val_labels) ##### TP, FN, FP, TN ##### TP_val, FN_val, FP_val, TN_val = confusion_matrix_c( val_preds, val_labels) total_acc += TP_val + TN_val val_accuracy = (TP_val + TN_val) / (TP_val + FN_val + FP_val + TN_val) print("[Validation] {} Loss: {:.3f}, Accuracy: {:.3f}". format(k, val_loss.data, val_accuracy)) ##### Save best model #####: total_acc /= len(val_set) if best_acc < total_acc: torch.save(v.state_dict(), 'best_model+' + str(iter) + '.pt') best_acc = total_acc print("===> Best model saved in epoch:", i, ", iter:", iter, ", acc:", total_acc) # writer # writer.add_scalar('test_epoch_loss', test_loss.data, j) writer.add_scalar('val_accuracy', total_acc, iter) # writer.add_scalar('test_epoch_precision', test_precision, j) # writer.add_scalar('test_epoch_recall', test_recall, j) # writer.add_scalar('test_epoch_f1score', test_f1_score, j) # writer.add_hparams({"test_TP": TP_test, "test_TN": TN_test, "test_FP": FP_test, "test_FN": FN_test}) train_iter = iter ##### TEST #####
# Build a torch ViT whose layout matches the pretrained JAX/TF checkpoint
# (12 layers, 12 heads, mlp_dim 3072, 1000 classes) and start copying its
# weights into a torch state-dict-compatible mapping.
# NOTE(review): `input_size` and `pretain_tf_model` are defined earlier in
# this file (outside this chunk) — presumably the checkpoint's image size
# and its loaded parameter tree; confirm against the loading code.
patch_size = 16
num_layers = 12
# print(pretain_tf_model.keys())
# print_size(pretain_tf_model['pre_logits'])
v = ViT(image_size=input_size, patch_size=patch_size, num_classes=1000, depth=num_layers, heads=12, mlp_dim=3072, dropout=0.1, emb_dropout=0.1)
# Dump every parameter name and shape for eyeballing against the checkpoint.
print("Model's state_dict:")
for param_tensor in v.state_dict():
    print(param_tensor, "\t", v.state_dict()[param_tensor].size())
## copy embedding
tf_dict = {}  # accumulates tensors keyed by torch state_dict names
embedding_weight_shape = pretain_tf_model['embedding']['kernel'].shape
# Permute kernel axes (3, 2, 0, 1) — presumably TF's (H, W, in, out) to
# torch's (out, in, H, W); verify against the checkpoint layout.
embedding_weight = np.array(
    jnp.transpose(pretain_tf_model['embedding']['kernel'], (3, 2, 0, 1)))
# embedding_weight = pretain_tf_model['embedding']['kernel'].reshape([embedding_weight_shape[3],embedding_weight_shape[2],embedding_weight_shape[1],embedding_weight_shape[0]])
tf_dict['embedding.weight'] = torch.from_numpy(embedding_weight)
tf_dict['embedding.bias'] = torch.from_numpy(
    pretain_tf_model['embedding']['bias'])
## copy mlp_head