def train_model(args):
    from torchvision import datasets, models
    from tqdm import tqdm

    imgs, trn_df, vld_df = _get_images(args.train_dir, args.num_folds,
                                       args.vld_fold_idx, data_type='train')
    trn_loader, vld_loader = _get_data_loader(imgs, trn_df, vld_df)

    logger.info("=== Getting Pre-trained model ===")
    model = models.resnet18(pretrained=True)
    last_hidden_units = model.fc.in_features
    # 168 grapheme roots + 11 vowel diacritics + 7 consonant diacritics = 186 outputs
    model.fc = torch.nn.Linear(last_hidden_units, 186)

    # len_buffer = len(list(module.buffers()))
    # logger.info("=== Buffer ===")
    # print(f"len_buffer={len_buffer}")
    # print(list(model.buffers()))

    # Use SMDataParallel PyTorch DDP for efficient distributed training
    model = DDP(model.to(args.device), broadcast_buffers=False)

    # SDP: Pin each GPU to a single SDP process.
    torch.cuda.set_device(args.local_rank)
    model.cuda(args.local_rank)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fn = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           verbose=True,
                                                           patience=5,
                                                           factor=0.5)
    best_score = -1
    training_stats = []

    logger.info("=== Start Training ===")
    for epoch_id in range(args.num_epochs):
        ################################################################################
        # ==> Training phase
        ################################################################################
        trn_loss = []
        model.train()

        # Measure how long the training epoch takes.
        t0 = time.time()
        running_loss = 0.0

        for batch_id, (inputs, targets) in enumerate(trn_loader):
            inputs = inputs.cuda()
            targets = targets.cuda()
            targets_gra = targets[:, 0]
            targets_vow = targets[:, 1]
            targets_con = targets[:, 2]

            # Use the original batch as-is with 50% probability
            if np.random.rand() < 0.5:
                logits = model(inputs)
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]
                loss1 = loss_fn(grapheme, targets_gra)
                loss2 = loss_fn(vowel, targets_vow)
                loss3 = loss_fn(cons, targets_con)
            else:
                # CutMix: mix a randomly shuffled pair of samples within the batch
                lam = np.random.beta(1.0, 1.0)
                rand_index = torch.randperm(inputs.size()[0])
                shuffled_targets_gra = targets_gra[rand_index]
                shuffled_targets_vow = targets_vow[rand_index]
                shuffled_targets_con = targets_con[rand_index]

                bbx1, bby1, bbx2, bby2 = _rand_bbox(inputs.size(), lam)
                inputs[:, :, bbx1:bbx2, bby1:bby2] = inputs[rand_index, :, bbx1:bbx2, bby1:bby2]
                # Adjust lambda so it exactly matches the mixed pixel ratio
                lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) /
                           (inputs.size()[-1] * inputs.size()[-2]))

                logits = model(inputs)
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]
                loss1 = loss_fn(grapheme, targets_gra) * lam + loss_fn(
                    grapheme, shuffled_targets_gra) * (1. - lam)
                loss2 = loss_fn(vowel, targets_vow) * lam + loss_fn(
                    vowel, shuffled_targets_vow) * (1. - lam)
                loss3 = loss_fn(cons, targets_con) * lam + loss_fn(
                    cons, shuffled_targets_con) * (1. - lam)

            loss = 0.5 * loss1 + 0.25 * loss2 + 0.25 * loss3
            trn_loss.append(loss.item())
            running_loss += loss.item()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Print vital information
            if (batch_id + 1) % args.log_interval == 0:
                s = f'[Epoch {epoch_id} Batch {batch_id+1}/{len(trn_loader)}] ' \
                    f'loss: {running_loss / args.log_interval:.4f}'
                print(s)
                running_loss = 0

        # Measure how long this epoch took.
        trn_time = _format_time(time.time() - t0)

        if args.rank == 0:
            ################################################################################
            # ==> Validation phase
            ################################################################################
            val_loss = []
            val_true = []
            val_pred = []
            model.eval()

            logger.info('=== Start Validation ===')
            with torch.no_grad():
                for inputs, targets in vld_loader:
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    logits = model(inputs)
                    grapheme = logits[:, :168]
                    vowel = logits[:, 168:179]
                    cons = logits[:, 179:]
                    loss = 0.5 * loss_fn(grapheme, targets[:, 0]) + \
                           0.25 * loss_fn(vowel, targets[:, 1]) + \
                           0.25 * loss_fn(cons, targets[:, 2])
                    val_loss.append(loss.item())

                    grapheme = grapheme.cpu().argmax(dim=1).data.numpy()
                    vowel = vowel.cpu().argmax(dim=1).data.numpy()
                    cons = cons.cpu().argmax(dim=1).data.numpy()
                    val_true.append(targets.cpu().numpy())
                    val_pred.append(np.stack([grapheme, vowel, cons], axis=1))

            val_true = np.concatenate(val_true)
            val_pred = np.concatenate(val_pred)
            val_loss = np.mean(val_loss)
            trn_loss = np.mean(trn_loss)

            score_g = recall_score(val_true[:, 0], val_pred[:, 0], average='macro')
            score_v = recall_score(val_true[:, 1], val_pred[:, 1], average='macro')
            score_c = recall_score(val_true[:, 2], val_pred[:, 2], average='macro')
            final_score = np.average([score_g, score_v, score_c], weights=[2, 1, 1])

            # Print vital information
            s = f'[Epoch {epoch_id}] ' \
                f'trn_loss: {trn_loss:.4f}, vld_loss: {val_loss:.4f}, score: {final_score:.4f}, ' \
                f'score_each: [{score_g:.4f}, {score_v:.4f}, {score_c:.4f}]'
            print(s)

            ################################################################################
            # ==> Save checkpoint and training stats
            ################################################################################
            if final_score > best_score:
                best_score = final_score
                state_dict = model.cpu().state_dict()
                model = model.cuda()
                torch.save(state_dict, os.path.join(args.model_dir, 'model.pth'))

            # Record all statistics from this epoch
            training_stats.append({
                'epoch': epoch_id + 1,
                'trn_loss': trn_loss,
                'trn_time': trn_time,
                'val_loss': val_loss,
                'score': final_score,
                'score_g': score_g,
                'score_v': score_v,
                'score_c': score_c
            })

            # === Save Model Parameters ===
            logger.info("Model successfully saved at: {}".format(args.model_dir))

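# The CutMix branch in train_model() relies on a _rand_bbox helper that is not shown
# in this section. The sketch below follows the standard CutMix bounding-box sampler
# and is provided only as an illustration of what that helper is assumed to do; the
# original script's implementation may differ in details.
def _rand_bbox(size, lam):
    """Sample a CutMix box; bbx indexes dim 2 and bby indexes dim 3 of (N, C, H, W)."""
    dim2, dim3 = size[2], size[3]
    cut_rat = np.sqrt(1. - lam)          # box side ratio so the box area is roughly (1 - lam)
    cut_2 = int(dim2 * cut_rat)
    cut_3 = int(dim3 * cut_rat)

    # Uniformly sample the box center, then clip the box to the image bounds.
    c2 = np.random.randint(dim2)
    c3 = np.random.randint(dim3)
    bbx1 = np.clip(c2 - cut_2 // 2, 0, dim2)
    bby1 = np.clip(c3 - cut_3 // 2, 0, dim3)
    bbx2 = np.clip(c2 + cut_2 // 2, 0, dim2)
    bby2 = np.clip(c3 + cut_3 // 2, 0, dim3)
    return bbx1, bby1, bbx2, bby2
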
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--verbose', action='store_true', default=False,
                        help='For displaying SM Data Parallel-specific logs')
    parser.add_argument('--data-path', type=str, default='/tmp/data',
                        help='Path for downloading the MNIST dataset')

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    # Scale down the per-GPU batch size as the cluster grows beyond a single 8-GPU node
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print('Hello from rank', rank, 'of local_rank', local_rank,
              'in world size of', args.world_size)

    if not torch.cuda.is_available():
        raise Exception(
            "Must run SM Distributed DataParallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)
    device = torch.device("cuda")

    # Download the dataset on local rank 0 only; other local ranks wait briefly and reuse it
    if local_rank == 0:
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ), (0.3081, ))
                                       ]))
    else:
        time.sleep(8)
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ), (0.3081, ))
                                       ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(data_path,
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.test_batch_size,
            shuffle=True)

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

def sdp_init(model, optimizer, args):
    model = DDP(model.to(args.device), broadcast_buffers=False)
    # model = DDP(model, device_ids=[args.rank], broadcast_buffers=False)
    model.cuda(args.local_rank)
    return model, optimizer, args

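# The functions in this file assume smdistributed.dataparallel has already been imported
# and initialized elsewhere in the script. The sketch below shows that usual setup,
# using the smdistributed.dataparallel import paths these examples rely on; build_model
# and build_optimizer are hypothetical stand-ins, not functions from the original code.
import torch
import smdistributed.dataparallel.torch.distributed as dist
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP


def example_setup(args, build_model, build_optimizer):
    # SMDataParallel: initialize the process group and record the topology on args
    dist.init_process_group()
    args.world_size = dist.get_world_size()
    args.rank = dist.get_rank()
    args.local_rank = dist.get_local_rank()
    args.device = torch.device("cuda")

    model = build_model()
    optimizer = build_optimizer(model)
    # Wrap the model with SMDataParallel's DDP via the helper above
    model, optimizer, args = sdp_init(model, optimizer, args)
    return model, optimizer, args
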
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=14, metavar="N",
                        help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M",
                        help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=10, metavar="N",
                        help="how many batches to wait before logging training status")
    parser.add_argument("--save-model", action="store_true", default=False,
                        help="For Saving the current Model")
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="For displaying smdistributed.dataparallel-specific logs")
    parser.add_argument("--data-path", type=str, default="/tmp/data",
                        help="Path for downloading the MNIST dataset")

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print("Hello from rank", rank, "of local_rank", local_rank,
              "in world size of", args.world_size)

    if not torch.cuda.is_available():
        raise CUDANotFoundException(
            "Must run smdistributed.dataparallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)
    device = torch.device("cuda")

    # Select a single rank per node to download data
    is_first_local_rank = local_rank == 0
    if is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )
    dist.barrier()  # prevent other ranks from accessing the data early
    if not is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        sampler=train_sampler,
    )
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                data_path,
                train=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307, ), (0.3081, ))
                ]),
            ),
            batch_size=args.test_batch_size,
            shuffle=True,
        )

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

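def launch_smddp_example():
    # Sketch of how a script like the MNIST example above is typically submitted to
    # SageMaker with smdistributed dataparallel enabled. The entry_point name, instance
    # count/type, and framework/python versions are illustrative assumptions, not values
    # taken from the original scripts.
    import sagemaker
    from sagemaker.pytorch import PyTorch

    estimator = PyTorch(
        entry_point="train_mnist.py",            # hypothetical script name
        role=sagemaker.get_execution_role(),
        instance_count=2,
        instance_type="ml.p3.16xlarge",          # an SMDataParallel-supported instance type
        framework_version="1.8.1",               # assumed; any SMDDP-enabled container works
        py_version="py36",
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    )
    estimator.fit()
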
def train(cfg, args):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if use_amp:
        # Initialize mixed-precision training
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if args.distributed:
        # if use_apex_ddp:
        #     model = DDP(model, delay_allreduce=True)
        # else:
        # SMDataParallel: Wrap the PyTorch model with SMDataParallel's DDP
        model = DDP(model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)
        # model = DDP(model)

    print("model parameter size: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    # SMDataParallel: Save model on master node.
    save_to_disk = dist.get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
        data_dir=args.data_dir
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # Set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=args.distributed,
            min_bbox_map=cfg.MIN_BBOX_MAP,
            min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        use_amp,
        cfg,
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    return model

def train(model, device):
    """
    Train the PyTorch model
    """
    cat_mask = [False, True, True, True, True, False, True, True, True,
                True, True, False, False, False, False, False, False, False]
    train_ds = CsvDatasetSimple(args.train)
    test_ds = CsvDatasetSimple(args.test)

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    logger.info("batch_size = {}, epochs = {}, learning rate = {}".format(
        batch_size, epochs, learning_rate))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_ds, num_replicas=args.world_size, rank=args.rank)
    train_dl = DataLoader(train_ds, batch_size, shuffle=False, drop_last=True,
                          sampler=train_sampler)

    model = TabularNet(n_cont=9,
                       n_cat=9,
                       cat_mask=cat_mask,
                       cat_dim=[0, 2050, 13, 5, 366, 0, 50000, 50000, 50000,
                                50000, 50, 0, 0, 0, 0, 0, 0, 0],
                       y_min=0.,
                       y_max=1.,
                       device=device)
    logger.debug(model)
    model = DDP(model).to(device)
    torch.cuda.set_device(args.local_rank)
    model.cuda(args.local_rank)

    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        batch_no = 0
        for x_train_batch, y_train_batch in train_dl:
            logger.debug(f"Training on shape {x_train_batch.shape}")
            y = model(x_train_batch.float())
            loss = criterion(y.flatten(), y_train_batch.float().to(device))
            if batch_no % 50 == 0:
                logger.info(f"batch {batch_no} -> loss: {loss}")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_no += 1
        epoch += 1
        logger.info(f"epoch: {epoch} -> loss: {loss}")

    # Evaluate on the test set
    if args.rank == 0:
        model.eval()
        test_dl = DataLoader(test_ds, batch_size, drop_last=True, shuffle=False)
        with torch.no_grad():
            mse = 0.
            for x_test_batch, y_test_batch in test_dl:
                y = model(x_test_batch.float())
                mse = mse + ((y - y_test_batch.to(device)) ** 2).sum() / x_test_batch.shape[0]
            mse = mse / len(test_dl.dataset)
            logger.info(f"Test MSE: {mse}")

        torch.save(model.state_dict(), args.model_dir + "/model.pth")

        # PyTorch requires the inference script to be inside the .tar.gz model file,
        # and the Step Functions SDK doesn't do this.
        inference_code_path = args.model_dir + "/code/"

        if not os.path.exists(inference_code_path):
            os.mkdir(inference_code_path)
            logger.info("Created a folder at {}!".format(inference_code_path))

        shutil.copy("train_pytorch.py", inference_code_path)
        shutil.copy("model_pytorch.py", inference_code_path)
        shutil.copy("csv_loader.py", inference_code_path)
        logger.info("Saving model files to {}".format(inference_code_path))

def train(args):
    """
    Train the PyTorch model
    """
    cat_mask = [False, True, True, True, True, False, True, True, True,
                True, True, False, False, False, False, False, False, False]
    train_ds = CsvDatasetSimple(args.train)
    test_ds = CsvDatasetSimple(args.test)

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    logger.info("batch_size = {}, epochs = {}, learning rate = {}".format(
        batch_size, epochs, learning_rate))
    logger.info(f"World size: {args.world_size}")
    logger.info(f"Rank: {args.rank}")
    logger.info(f"Local Rank: {args.local_rank}")

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_ds, num_replicas=args.world_size, rank=args.rank)
    logger.debug("Created distributed sampler")
    train_dl = DataLoader(train_ds,
                          batch_size,
                          shuffle=False,
                          num_workers=0,
                          pin_memory=True,
                          drop_last=True,
                          sampler=train_sampler)
    logger.debug("Created train data loader")

    model = TabularNet(n_cont=9,
                       n_cat=9,
                       cat_mask=cat_mask,
                       cat_dim=[0, 2050, 13, 5, 366, 0, 50000, 50000, 50000,
                                50000, 50, 0, 0, 0, 0, 0, 0, 0],
                       y_min=0.,
                       y_max=1.)
    logger.debug("Created model")
    logger.debug(model)

    model = DDP(model.to(device), broadcast_buffers=False)
    logger.debug("Created DDP")
    torch.cuda.set_device(args.local_rank)
    logger.debug("Set device on CUDA")
    model.cuda(args.local_rank)
    logger.debug(f"Set model CUDA to {args.local_rank}")

    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    logger.debug("Created loss fn and optimizer")

    model.train()
    for epoch in range(epochs):
        batch_no = 0
        for x_train_batch, y_train_batch in train_dl:
            logger.debug(f"Working on batch {batch_no}")
            x_train_batch_d = x_train_batch.to(device)
            y_train_batch_d = y_train_batch.to(device)

            optimizer.zero_grad()
            logger.debug("Did optimizer zero_grad")

            logger.debug(f"Training on shape {x_train_batch.shape}")
            y = model(x_train_batch_d.float())
            logger.debug("Did forward pass")
            loss = criterion(y.flatten(), y_train_batch_d.float())
            logger.debug("Got loss")
            if batch_no % 50 == 0:
                logger.info(f"batch {batch_no} -> loss: {loss}")
            loss.backward()
            logger.debug("Did backward step")
            optimizer.step()
            logger.debug(f"Did optimizer step, batch {batch_no}")
            batch_no += 1
        epoch += 1
        logger.info(f"epoch: {epoch} -> loss: {loss}")

    # Evaluate on the test set
    if args.rank == 0:
        logger.info("Starting test eval on rank 0")
        model.eval()
        test_dl = DataLoader(test_ds, batch_size, drop_last=True, shuffle=False)
        logger.info("Loaded test data set")
        with torch.no_grad():
            mse = 0.
            batch_no = 0
            for x_test_batch, y_test_batch in test_dl:
                x_test_batch_d = x_test_batch.to(device)
                y_test_batch_d = y_test_batch.to(device)
                y = model(x_test_batch_d.float())
                mse += F.mse_loss(y, y_test_batch_d, reduction="sum")
                # mse = mse + ((y - y_test_batch_d) ** 2).sum() / x_test_batch.shape[0]
                if batch_no % 50 == 0:
                    logger.info(f"batch {batch_no} -> MSE: {mse}")
                batch_no += 1
            mse = mse / len(test_dl.dataset)
            logger.info(f"Test MSE: {mse}")

    if args.rank == 0:
        logger.info("Saving model on rank 0")
        torch.save(model.state_dict(), args.model_dir + "/model.pth")

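# Sketch of the entry point the tabular train(args) function above assumes: it parses
# hyperparameters (names inferred from the args attributes used above), reads the standard
# SageMaker channel/model-dir environment variables, initializes smdistributed.dataparallel,
# and sets the module-level `device` that train(args) references. The default values and
# the placement of these imports are illustrative assumptions, not the original script.
import argparse
import os
import torch
import smdistributed.dataparallel.torch.distributed as dist

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--learning_rate", type=float, default=0.01)
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    args = parser.parse_args()

    # SMDataParallel: initialize the process group and record the topology on args
    dist.init_process_group()
    args.world_size = dist.get_world_size()
    args.rank = dist.get_rank()
    args.local_rank = dist.get_local_rank()

    device = torch.device("cuda")   # module-level device referenced by train(args)
    train(args)                     # the tabular train(args) defined above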