def main():
    """Train the SSD model, resuming from a saved checkpoint when one exists.

    If ``config.PATH_TO_CHECKPOINT`` can be loaded, the model and optimizer
    objects stored in it are reused and training resumes at the epoch after
    the one recorded in the checkpoint. If loading raises
    ``FileNotFoundError``, a new ``SSD`` model and an ``SGD`` optimizer are
    created and training starts at epoch 0.

    The number of epochs is derived from ``config.NUM_ITERATIONS_TRAIN``
    divided (integer division) by the number of batches per epoch. A
    checkpoint is written after every epoch.

    NOTE(review): ``config`` and ``device`` are not defined in this function;
    presumably they are module-level names -- confirm.
    """
    try:
        # Always deserialize onto the CPU; the model is moved to `device` later.
        saved_state = torch.load(config.PATH_TO_CHECKPOINT,
                                 map_location=torch.device('cpu'))
    except FileNotFoundError:
        print('PATH_TO_CHECKPOINT not specified in SSDConfig.\nMaking new model and optimizer.')
        start_epoch = 0
        model = SSD(config)
        grouped_params = utils.get_model_params(model)
        # The 'biases' group gets twice the base learning rate; the
        # 'not_biases' group falls back to the optimizer-wide `lr`.
        param_groups = [
            {'params': grouped_params['biases'],
             'lr': 2 * config.LEARNING_RATE},
            {'params': grouped_params['not_biases']},
        ]
        optimizer = SGD(params=param_groups,
                        lr=config.LEARNING_RATE,
                        momentum=config.MOMENTUM,
                        weight_decay=config.WEIGHT_DECAY)
    else:
        start_epoch = saved_state['epoch'] + 1
        print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
        model = saved_state['model']
        optimizer = saved_state['optimizer']

    # Training data pipeline.
    annotations = get_dataframe(config.PATH_TO_ANNOTATIONS)
    train_set = ShelfImageDataset(annotations, config.PATH_TO_IMAGES, train=True)
    loader = DataLoader(train_set,
                        shuffle=True,
                        collate_fn=collate_fn,
                        batch_size=config.TRAIN_BATCH_SIZE,
                        num_workers=config.NUM_DATALOADER_WORKERS)

    # Move the model first: the loss is built from the model's priors.
    model.to(device)
    criterion = MultiBoxLoss(model.priors_cxcy, config).to(device)

    # Convert the iteration budget into whole epochs.
    total_epochs = config.NUM_ITERATIONS_TRAIN // len(loader)

    # Epoch indices at which the learning rate is decayed, expressed in
    # config as fractions of the total number of epochs.
    lr_decay_epochs = {int(total_epochs * fraction)
                       for fraction in config.DECAY_LR_AT}

    for epoch in range(start_epoch, total_epochs):
        if epoch in lr_decay_epochs:
            utils.adjust_learning_rate(optimizer, config.DECAY_FRAC)

        train(loader, model, criterion, optimizer, epoch)

        utils.save_checkpoint(epoch, model, optimizer, config,
                              config.PATH_TO_CHECKPOINT)
def initialize_net() -> "SSD":
    """Return the inference-mode SSD network, building and caching it on first use.

    On the first call the network is constructed with ``phase="inference"``,
    the trained weights are loaded from ``model.pth`` inside
    ``Parameters.ABEJA_TRAINING_RESULT_DIR``, and the network is moved to CUDA
    when available (CPU otherwise) and switched to eval mode. The result is
    stored in the module-level ``ssd_net`` and returned directly by later
    calls.

    The module-level cache is only populated after the network has been fully
    initialised, so a failure while loading the weights does not leave a
    half-initialised network behind for subsequent calls.

    NOTE(review): ``num_classes`` is not defined in this function; presumably
    it is a module-level name -- confirm.

    Returns:
        The initialised SSD network.

    Raises:
        Whatever ``torch.load`` / ``load_state_dict`` raise when the weight
        file is missing or does not match the network.
    """
    global ssd_net

    # if already defined, return it
    if ssd_net is not None:
        print('use cached ssd_net')
        return ssd_net

    # get device ( cpu / gpu ) to be used
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f'device : {device}')

    ssd_cfg = {
        'num_classes': num_classes,  # number of classes including background class
        'input_size': Parameters.IMG_SIZE,
        'bbox_aspect_num': Parameters.BBOX_ASPECT_NUM,
        'feature_maps': Parameters.FEATURE_MAPS,
        'steps': Parameters.STEPS,
        'min_sizes': Parameters.MIN_SIZES,
        'max_sizes': Parameters.MAX_SIZES,
        'aspect_ratios': Parameters.ASPECT_RATIOS,
        'conf_thresh': Parameters.CONF_THRESHOLD,
        'top_k': Parameters.TOP_K,
        'nms_thresh': Parameters.NMS_THRESHOLD,
    }
    print(f'initializing ssd with : {ssd_cfg}')

    # Build into a local first; the global cache is assigned only once the
    # network is completely ready.
    net = SSD(phase="inference", cfg=ssd_cfg)

    # load weight created in training
    weight_file_path = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                                    'model.pth')
    print(f'weight_file_path : {weight_file_path}')

    # map_location lets weights saved on GPU be loaded on a CPU-only host.
    # cf. https://pytorch.org/tutorials/beginner/saving_loading_models.html#save-on-gpu-load-on-gpu
    weight = torch.load(weight_file_path, map_location=device)
    net.load_state_dict(weight)

    net = net.to(device)
    net.eval()

    ssd_net = net
    return ssd_net
def handler(context):
    """Train an SSD network on the datasets named in ``context`` and save it.

    Steps performed:

    1. Resolve the train / validation dataset ids from ``context`` and load
       the items. When no validation dataset id is given, the first
       ``Parameters.TEST_SIZE`` fraction of the shuffled training items is
       held out for validation.
    2. Build the SSD network in ``"train"`` phase, load the downloaded
       ``vgg16_reducedfc.pth`` weights into ``net.vgg`` and apply
       ``weights_init`` to ``net.extras``, ``net.loc`` and ``net.conf``.
    3. Train for ``Parameters.EPOCHS`` epochs with SGD. Validation, the
       ``test/loss`` scalar and an intermediate ``ssd300_<epoch>.pth``
       checkpoint are produced every tenth epoch.
    4. Save the final weights as ``model.pth`` in
       ``Parameters.ABEJA_TRAINING_RESULT_DIR``.

    The summary writer is closed even when training fails part-way.

    NOTE(review): ``device``, ``writer`` and ``statistics`` are not defined in
    this function; presumably they are module-level names -- confirm.

    Args:
        context: Object exposing a ``datasets`` attribute, or (older
            versions) a mapping with a ``'datasets'`` key. The value must
            provide ``.values()``.

    Returns:
        None.
    """
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    try:
        dataset_alias = context.datasets
    except AttributeError:
        # for older version
        dataset_alias = context['datasets']

    train_dataset_id, val_dataset_id = get_dataset_ids(dataset_alias)

    id2index, _ = set_categories(list(dataset_alias.values()))
    num_classes = len(id2index)
    num_classes += 1  # add for background class
    print(f'number of classes : {num_classes}')

    print("Start downloading datasets.")
    dataset_items = list(
        load_dataset_from_api(train_dataset_id, max_num=Parameters.MAX_ITEMS))
    print("Finish downloading datasets.")

    random.shuffle(dataset_items)
    if val_dataset_id is not None:
        val_dataset_items = list(
            load_dataset_from_api(val_dataset_id,
                                  max_num=Parameters.MAX_ITEMS))
        random.shuffle(val_dataset_items)
        train_dataset_items = dataset_items
    else:
        # No dedicated validation dataset: hold out the leading slice of the
        # shuffled items for validation and train on the remainder.
        test_size = int(len(dataset_items) * Parameters.TEST_SIZE)
        train_dataset_items = dataset_items[test_size:]
        val_dataset_items = dataset_items[:test_size]

    train_dataset = ABEJAPlatformDataset(train_dataset_items,
                                         phase="train",
                                         transform=DataTransform(
                                             Parameters.IMG_SIZE,
                                             Parameters.MEANS))
    val_dataset = ABEJAPlatformDataset(val_dataset_items,
                                       phase="val",
                                       transform=DataTransform(
                                           Parameters.IMG_SIZE,
                                           Parameters.MEANS))

    print(f'train dataset : {len(train_dataset)}')
    print(f'val dataset : {len(val_dataset)}')

    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=Parameters.BATCH_SIZE,
                                       shuffle=Parameters.SHUFFLE,
                                       collate_fn=od_collate_fn)
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=Parameters.BATCH_SIZE,
                                     shuffle=False,
                                     collate_fn=od_collate_fn)
    dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}
    print(f'data loaders : {dataloaders_dict}')

    ssd_cfg = {
        'num_classes': num_classes,  # number of classes including background class
        'input_size': Parameters.IMG_SIZE,
        'bbox_aspect_num': Parameters.BBOX_ASPECT_NUM,
        'feature_maps': Parameters.FEATURE_MAPS,
        'steps': Parameters.STEPS,
        'min_sizes': Parameters.MIN_SIZES,
        'max_sizes': Parameters.MAX_SIZES,
        'aspect_ratios': Parameters.ASPECT_RATIOS,
        'conf_thresh': Parameters.CONF_THRESHOLD,
        'top_k': Parameters.TOP_K,
        'nms_thresh': Parameters.NMS_THRESHOLD,
    }
    net = SSD(phase="train", cfg=ssd_cfg)

    # TODO: better to host this file by ourselves
    # https://github.com/amdegroot/ssd.pytorch#training-ssd
    url = 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth'
    weight_file = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                               'vgg16_reducedfc.pth')
    download(url, weight_file)

    # map_location keeps the load working on hosts whose device differs from
    # the one the weights were saved on.
    vgg_weights = torch.load(weight_file, map_location=device)
    print('finish loading base network...')
    net.vgg.load_state_dict(vgg_weights)

    def weights_init(m):
        """Kaiming-normal init for Conv2d weights; zero any bias."""
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight.data)
            if m.bias is not None:  # in case of bias
                nn.init.constant_(m.bias, 0.0)

    # apply initial values of He
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)

    # configure loss function
    criterion = MultiBoxLoss(jaccard_thresh=Parameters.OVERLAP_THRESHOLD,
                             neg_pos=Parameters.NEG_POS,
                             device=device)

    # configure optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=Parameters.LR,
                          momentum=Parameters.MOMENTUM,
                          dampening=Parameters.DAMPENING,
                          weight_decay=Parameters.WEIGHT_DECAY,
                          nesterov=Parameters.NESTEROV)

    # move network to device
    net.to(device)

    # NOTE: This flag allows to enable the inbuilt cudnn auto-tuner
    # to find the best algorithm to use for your hardware.
    # cf. https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/2
    torch.backends.cudnn.benchmark = True

    # Validation, the 'test/loss' scalar and the intermediate checkpoint all
    # happen on the same epochs; keep the interval in one place.
    val_interval = 10

    iteration = 1
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    # The last computed averages are kept so that epochs without a validation
    # pass still report the most recent validation loss.
    latest_epoch_train_loss = epoch_train_loss
    latest_epoch_val_loss = epoch_val_loss

    try:
        for epoch in range(Parameters.EPOCHS):
            is_val_epoch = (epoch + 1) % val_interval == 0

            t_epoch_start = time.time()
            t_iter_start = time.time()

            print('-------------')
            print('Epoch {}/{}'.format(epoch + 1, Parameters.EPOCHS))
            print('-------------')

            # loop of train and validation for each epoch
            for phase in ['train', 'val']:
                if phase == 'train':
                    net.train()
                    print('(train)')
                elif is_val_epoch:
                    net.eval()
                    print('-------------')
                    print('(val)')
                else:
                    # perform validation once every ten times
                    continue

                # loop each mini-batch from data loader
                for images, targets in dataloaders_dict[phase]:
                    images = images.to(device)
                    targets = [ann.to(device) for ann in targets]

                    # reset accumulated gradients
                    optimizer.zero_grad()

                    # calculate forward (gradients tracked only when training)
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = net(images)

                        # calculate loss
                        loss_l, loss_c = criterion(outputs, targets)
                        loss = loss_l + loss_c

                        if phase == 'train':
                            # back propagate when training
                            loss.backward()  # calculate gradient

                            nn.utils.clip_grad_value_(
                                net.parameters(),
                                clip_value=Parameters.CLIP_VALUE)

                            optimizer.step()  # update parameters

                            if iteration % 10 == 0:
                                # display loss once every ten iterations
                                t_iter_finish = time.time()
                                duration = t_iter_finish - t_iter_start
                                print(
                                    'iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'
                                    .format(iteration, loss.item(), duration))
                                t_iter_start = time.time()

                            epoch_train_loss += loss.item()
                            iteration += 1
                        else:
                            epoch_val_loss += loss.item()

            # loss and accuracy rate of each phase of epoch
            t_epoch_finish = time.time()

            # keep latest epoch loss; a zero sum means the phase did not run
            if epoch_train_loss != 0.0:
                num_total = len(dataloaders_dict['train'])
                latest_epoch_train_loss = epoch_train_loss / num_total
            if epoch_val_loss != 0.0:
                num_total = len(dataloaders_dict['val'])
                latest_epoch_val_loss = epoch_val_loss / num_total

            print('-------------')
            print('epoch {} || Epoch_TRAIN_Loss:{:.4f} || Epoch_VAL_Loss:{:.4f}'.
                  format(epoch + 1, latest_epoch_train_loss,
                         latest_epoch_val_loss))
            print('timer: {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

            statistics(epoch + 1, latest_epoch_train_loss, None,
                       latest_epoch_val_loss, None)

            writer.add_scalar('main/loss', latest_epoch_train_loss, epoch + 1)
            if is_val_epoch:
                writer.add_scalar('test/loss', latest_epoch_val_loss,
                                  epoch + 1)

                model_path = os.path.join(
                    Parameters.ABEJA_TRAINING_RESULT_DIR,
                    f'ssd300_{epoch + 1}.pth')
                torch.save(net.state_dict(), model_path)

            writer.flush()
            epoch_train_loss = 0.0
            epoch_val_loss = 0.0

        # Final weights are only written when every epoch completed.
        torch.save(
            net.state_dict(),
            os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
    finally:
        # Close the writer even if training raised.
        writer.close()