import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter  # tensorboardX's SummaryWriter also works
from tqdm import tqdm

import utils
# prepare() and validate() are project-local helpers; their defining modules
# are not shown in this listing, so they are assumed importable from the repo.


def main(args):
    # get arguments
    rate_num = args.rate_num
    use_side_feature = args.use_side_feature
    lr = args.lr
    weight_decay = args.weight_decay
    num_epochs = args.num_epochs
    hidden_dim = args.hidden_dim
    side_hidden_dim = args.side_hidden_dim
    out_dim = args.out_dim
    drop_out = args.drop_out
    split_ratio = args.split_ratio
    save_steps = args.save_steps
    log_dir = args.log_dir
    saved_model_folder = args.saved_model_folder
    use_data_whitening = args.use_data_whitening
    use_laplacian_loss = args.use_laplacian_loss
    laplacian_loss_weight = args.laplacian_loss_weight

    # mark and record the training run; save the training arguments for future analysis
    post_fix = '/' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    log_dir = log_dir + post_fix
    writer = SummaryWriter(log_dir=log_dir)
    with open(log_dir + '/test.txt', 'a') as f:
        f.write(str(vars(args)))
    print(log_dir)

    # get prepared data
    feature_u, feature_v, feature_dim, all_M_u, all_M_v, side_feature_u, side_feature_v, \
        all_M, mask, user_item_matrix_train, user_item_matrix_test, \
        laplacian_u, laplacian_v = prepare(args)

    if not os.path.exists(saved_model_folder):
        os.makedirs(saved_model_folder)
    weights_name = saved_model_folder + post_fix + '_weights'

    net = utils.create_models(feature_u, feature_v, feature_dim, hidden_dim,
                              rate_num, all_M_u, all_M_v, side_hidden_dim,
                              side_feature_u, side_feature_v, use_side_feature,
                              out_dim, drop_out)
    net.train()  # in train mode

    # create Adam optimizer
    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
    Loss = utils.loss(all_M, mask, user_item_matrix_train, laplacian_loss_weight)

    iter_bar = tqdm(range(num_epochs), desc='Iter (loss=X.XXX)')
    for epoch in iter_bar:
        optimizer.zero_grad()
        score = net.forward()
        if use_laplacian_loss:
            loss = Loss.laplacian_loss(score, laplacian_u, laplacian_v)
        else:
            loss = Loss.loss(score)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            rmse = Loss.rmse(score)
            val_rmse = validate(score, rate_num, user_item_matrix_test)
            iter_bar.set_description(
                'Iter (loss=%5.3f, rmse=%5.3f, val_rmse=%5.5f)'
                % (loss.item(), rmse.item(), val_rmse.item()))
            writer.add_scalars('scalar', {'loss': loss.item()}, epoch)

        if epoch % save_steps == 0:
            torch.save(net.state_dict(), weights_name)

    rmse = Loss.rmse(score)
    print('Final training RMSE: ', rmse.data.item())
    torch.save(net.state_dict(), weights_name)

    # turn the per-rating scores into an expected rating
    sm = nn.Softmax(dim=0)
    score = sm(score)
    score_list = torch.split(score, rate_num)
    pred = 0
    for i in range(rate_num):
        pred += (i + 1) * score_list[0][i]
    pred = utils.var_to_np(pred)

    # test the performance on the held-out entries
    test_mask = user_item_matrix_test > 0
    square_err = (pred * test_mask - user_item_matrix_test) ** 2
    mse = square_err.sum() / test_mask.sum()
    test_rmse = np.sqrt(mse)
    print('Test RMSE: ', test_rmse)
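# A hypothetical command-line entry point for the training script above. The
# flag names mirror the attributes that main() reads from `args`; the default
# values here are illustrative assumptions, not the repo's actual defaults.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--rate_num', type=int, default=5)
    parser.add_argument('--use_side_feature', type=int, default=0)
    parser.add_argument('--lr', type=float, default=1e-2)
    parser.add_argument('--weight_decay', type=float, default=1e-5)
    parser.add_argument('--num_epochs', type=int, default=1000)
    parser.add_argument('--hidden_dim', type=int, default=5)
    parser.add_argument('--side_hidden_dim', type=int, default=5)
    parser.add_argument('--out_dim', type=int, default=5)
    parser.add_argument('--drop_out', type=float, default=0.0)
    parser.add_argument('--split_ratio', type=float, default=0.8)
    parser.add_argument('--save_steps', type=int, default=100)
    parser.add_argument('--log_dir', type=str, default='./log')
    parser.add_argument('--saved_model_folder', type=str, default='./weights')
    parser.add_argument('--use_data_whitening', type=int, default=0)
    parser.add_argument('--use_laplacian_loss', type=int, default=0)
    parser.add_argument('--laplacian_loss_weight', type=float, default=0.05)
    main(parser.parse_args())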
# -*- coding: utf-8 -*-
"""
File creating the final predictions.
"""
import utils

labels = [0, 1, 2]

# parameters of the classifier
params = {0: ['KS_7', 0.01], 1: ['KS_4', 0.005], 2: ['KS_5', 0.002]}

utils.create_models(labels, params)
import os

import torch
from skimage import io  # assumed: the io.imshow/io.imsave usage matches skimage.io

import mask_to_submission  # project-local modules
import test
import utils

RUN_ON_GPU = torch.cuda.is_available()  # assumed flag; the repo may define it elsewhere


def test_net(model_choice, resize, image_size, TTA, ensemble, test_set_output,
             test_with_labels, only_test_single, test_image_name, test_root,
             validate_root, num_test=50):
    '''
    Model test, which includes three different tests:
    1. If test_set_output = 1, the prediction masks of all test images are written
       to the directory ./output. A submission file is also produced, as required
       by the competition.
    2. If test_with_labels = 1, we test all the images in the dataset and print
       the F1 score and the average loss.
    3. If only_test_single = 1, we test a single image, i.e. pass it through the
       network. This also outputs the original image covered by the prediction
       mask, saved as test.png.

    @model_choice: 1 for LinkNet, 2 for D-LinkNet, 3 for D-LinkNet+.
    @resize: boolean flag for image resizing.
    @image_size: the size of the images to be tested.
    @TTA: boolean flag for test-time augmentation.
    @ensemble: boolean flag to enable an ensemble when testing.
    @test_set_output: boolean flag for testing all the images in the test dataset.
    @test_with_labels: boolean flag for testing on a validation dataset, with labels provided.
    @only_test_single: boolean flag for testing a single image.
    @test_image_name: the name of the image to be tested.
    @test_root: root directory of the test dataset.
    @validate_root: root directory of the validation dataset.
    @num_test: number of test images in the test dataset.
    '''
    net = utils.create_models(model_choice)
    linkNet = None
    DlinkNet = None
    weights_name = './parameters/weights' + str(model_choice)
    # net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    if RUN_ON_GPU:
        net.load_state_dict(torch.load(weights_name))
    else:
        net.load_state_dict(
            torch.load(weights_name, map_location=lambda storage, loc: storage))
    net.eval()

    if ensemble:
        linkNet = utils.create_models(0)
        DlinkNet = utils.create_models(1)
        if RUN_ON_GPU:
            linkNet.load_state_dict(torch.load('./parameters/weights0'))
            DlinkNet.load_state_dict(torch.load('./parameters/weights1'))
        else:
            linkNet.load_state_dict(
                torch.load('./parameters/weights0',
                           map_location=lambda storage, loc: storage))
            DlinkNet.load_state_dict(
                torch.load('./parameters/weights1',
                           map_location=lambda storage, loc: storage))
        linkNet.eval()
        DlinkNet.eval()

    if test_with_labels:
        loss, f1 = test.test_batch_with_labels(net, validate_root, resize=resize,
                                               batch_size=1, image_size=image_size,
                                               smooth=1.0, lam=1.0)
        print('F1 is evaluated as ', f1)
        print('Average batch loss is ', loss)

    if only_test_single:
        if ensemble:
            mask, image = test.test_single_with_ensemble(linkNet, DlinkNet, net,
                                                         test_image_name,
                                                         size=image_size,
                                                         resize=resize)
        elif TTA:
            mask, image = test.test_single_with_TTA(net, test_image_name,
                                                    size=image_size, resize=resize)
        else:
            mask, image = test.test_single_image(net, test_image_name,
                                                 size=image_size, resize=resize)
        io.imshow(image)
        io.imsave('test.png', image)

    if test_set_output:
        if not os.path.exists('./output'):
            os.makedirs('./output')
        for i in range(1, num_test + 1):
            t = 'test_' + str(i)
            name = test_root + t + '/' + t + '.png'
            if ensemble:
                mask, image = test.test_single_with_ensemble(linkNet, DlinkNet, net,
                                                             name, size=image_size,
                                                             resize=resize)
            elif TTA:
                mask, image = test.test_single_with_TTA(net, name, size=image_size,
                                                        resize=resize)
            else:
                mask, image = test.test_single_image(net, name, size=image_size,
                                                     resize=resize)
            io.imsave('./output/' + 'test' + str(i) + '.png', mask)

        submission_filename = 'submission.csv'
        image_filenames = []
        for i in range(1, num_test + 1):
            image_filename = 'output/test' + str(i) + '.png'
            print(image_filename)
            image_filenames.append(image_filename)
        mask_to_submission.masks_to_submission(submission_filename, *image_filenames)
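# Example invocation of test_net. This is a sketch: the numeric values and the
# directory paths below are assumptions for illustration, not the repo's actual
# settings.
if __name__ == '__main__':
    test_net(model_choice=2,            # D-LinkNet, per the docstring
             resize=False,
             image_size=384,
             TTA=True,                  # enable test-time augmentation
             ensemble=False,
             test_set_output=1,         # write ./output/test*.png and submission.csv
             test_with_labels=0,
             only_test_single=0,
             test_image_name='test.png',
             test_root='./test_set_images/',
             validate_root='./validation/',
             num_test=50)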
import os
import shutil
import time

import torch
import torch.optim as optim
from skimage import io  # assumed: the io.imsave usage matches skimage.io

import test  # project-local modules
import utils


def train_net(root, resize, data_augment, rotate, change_color, lr, weight_decay,
              model_choice, save_ckpt, image_size, batch_size, num_epochs,
              save_test_image, test_image_name, early_stop, early_stop_tol,
              lr_decay, decay_rate, decay_period, validate_root,
              loss_type='bce', smooth=1.0, lam=1.0, gamma=2.0):
    '''
    Network training, which will output:
    1. a log of the loss in every iteration, in a text file.
    2. saved checkpoints containing the trained parameters, in the directory ./parameters.
    3. segmentation results on the test image, saved in the directory ./epoch_output.

    Parameters:
    @root: root directory of the training dataset.
    @resize: boolean flag for image resizing.
    @data_augment: boolean flag for DA8 (randomly rotate by 90 degrees, flip horizontally and vertically).
    @rotate: boolean flag for random rotation of the training images.
    @change_color: boolean flag for random perturbation of the HSV channels of the training images.
    @lr: learning rate.
    @weight_decay: weight decay for L2 regularization on the network parameters.
    @model_choice: 1 for LinkNet, 2 for D-LinkNet, 3 for D-LinkNet+.
    @save_ckpt: the period (in epochs) for saving a checkpoint of the network.
    @image_size: the size of the images to be trained on.
    @batch_size: batch size for mini-batch stochastic gradient descent.
    @num_epochs: number of epochs for training.
    @save_test_image: the period (in epochs) for saving the prediction on the test image.
    @test_image_name: the name of the test image.
    @early_stop: boolean flag to enable early stopping.
    @early_stop_tol: the tolerance (in number of saved checkpoints) before triggering early stopping.
    @lr_decay: boolean flag for learning rate decay in every decay period.
    @decay_rate: decay ratio for the learning rate, i.e. lr = lr * decay_rate.
    @decay_period: the period in number of epochs for triggering the learning rate decay.
    @validate_root: root directory of the validation dataset (mainly for evaluating the network during training).
    @loss_type: either 'bce' (BCE loss) or 'focal' (focal loss).
    @smooth: number added to the numerator and denominator when computing the dice loss.
    @lam: weight to balance the dice loss in the final combined loss.
    @gamma: focusing parameter of the focal loss.
    '''
    if os.path.exists('./epoch_output'):
        shutil.rmtree('./epoch_output')
    os.makedirs('./epoch_output')
    if not os.path.exists('./parameters'):
        os.makedirs('./parameters')
    weights_name = './parameters/weights' + str(model_choice)

    net = utils.create_models(model_choice)
    net.train()  # in train mode
    # net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))

    # create AMSGrad optimizer
    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay,
                           amsgrad=True)
    Loss = utils.loss(smooth, lam, gamma, loss_type)
    dataloader = utils.get_data_loader(root, resize, data_augment, image_size,
                                       batch_size, rotate, change_color)
    num_batch = len(dataloader)
    total_train_iters = num_epochs * num_batch
    loss_history = []
    print('Started training at {}'.format(time.asctime(time.localtime(time.time()))))

    test_loss = 100.0
    count = 0
    for epoch in range(num_epochs):
        print('Start epoch ', epoch)
        epoch_loss = 0
        t = time.time()
        for iteration, batch in enumerate(dataloader, epoch * num_batch + 1):
            print('Iteration: ', iteration)
            print('Loading the data takes: ', time.time() - t, ' s')
            t = time.time()
            image = utils.np_to_var(batch['image'])
            mask = utils.np_to_var(batch['mask'])

            optimizer.zero_grad()
            pred = net.forward(image)
            loss = Loss.final_loss(pred, mask)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.data.item()
            # print the log info
            print('Iteration [{:6d}/{:6d}] | loss: {:.4f}'.format(
                iteration, total_train_iters, loss.data.item()))
            print('Time spent on back propagation: ', time.time() - t, ' s')
            loss_history.append(loss.data.item())
            t = time.time()

        # save the test image for visualizing the training outcome
        if (epoch + 1) % save_test_image == 0:
            with torch.no_grad():
                _, test_image = test.test_single_image(net, test_image_name,
                                                       resize=False)
                io.imsave('./epoch_output/test_epoch' + str(epoch) + '.png',
                          test_image)

        # early stopping
        if early_stop and (epoch + 1) % save_ckpt == 0:
            with torch.no_grad():
                loss, f1 = test.test_batch_with_labels(net, validate_root,
                                                       resize=False, batch_size=10,
                                                       image_size=image_size,
                                                       smooth=smooth, lam=lam)
                print('On the validation dataset, loss: ', loss, ', F1: ', f1)
                if loss <= test_loss:
                    test_loss = loss
                    count = 0
                    torch.save(net.state_dict(), weights_name)
                elif count < early_stop_tol:
                    count += 1
                    lr *= decay_rate
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    print('The new loss is found to be larger than before')
                else:
                    print('Reached the early stopping tolerance...')
                    print('Break the update at ', epoch, 'th epoch')
                    break

        if not early_stop and (epoch + 1) % save_ckpt == 0:
            torch.save(net.state_dict(), weights_name)

        if lr_decay and (epoch + 1) % decay_period == 0:
            lr *= decay_rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        epoch_loss /= num_batch
        print('In epoch ', epoch, ', the average batch loss is ', epoch_loss)

    if not early_stop:
        torch.save(net.state_dict(), weights_name)

    # save the loss history
    with open('loss.txt', 'wt') as file:
        file.write('\n'.join(['{}'.format(loss) for loss in loss_history]))
        file.write('\n')
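# utils.loss(smooth, lam, gamma, loss_type) is not shown in this listing. The
# docstring above pins down how its parameters enter the objective, so the
# following is a minimal sketch of a BCE/focal + dice combination consistent
# with those descriptions; an assumption, not the repo's actual implementation.
import torch
import torch.nn.functional as F


class CombinedLossSketch:
    """BCE or focal loss plus a lam-weighted dice term (hypothetical)."""

    def __init__(self, smooth=1.0, lam=1.0, gamma=2.0, loss_type='bce'):
        self.smooth = smooth    # added to numerator and denominator of the dice term
        self.lam = lam          # weight of the dice term in the combined loss
        self.gamma = gamma      # focusing parameter of the focal loss
        self.loss_type = loss_type

    def final_loss(self, pred, target):
        # pred and target are assumed to hold probabilities in [0, 1]
        if self.loss_type == 'focal':
            p_t = pred * target + (1 - pred) * (1 - target)
            base = (-(1 - p_t) ** self.gamma * torch.log(p_t.clamp(min=1e-7))).mean()
        else:
            base = F.binary_cross_entropy(pred, target)
        # soft dice coefficient, smoothed on both numerator and denominator
        intersection = (pred * target).sum()
        dice = (2 * intersection + self.smooth) / (pred.sum() + target.sum() + self.smooth)
        return base + self.lam * (1 - dice)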
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

import utils
# prepare() and validate() are project-local helpers; their defining modules
# are not shown in this listing, so they are assumed importable from the repo.


def main(args):
    # get arguments
    rate_num = args.rate_num
    use_side_feature = args.use_side_feature  # whether to use side features
    use_GAT = args.use_GAT
    lr = args.lr
    weight_decay = args.weight_decay
    num_epochs = args.num_epochs
    hidden_dim = args.hidden_dim
    side_hidden_dim = args.side_hidden_dim
    out_dim = args.out_dim
    drop_out = args.drop_out
    split_ratio = args.split_ratio
    save_steps = args.save_steps
    saved_model_folder = args.saved_model_folder
    laplacian_loss_weight = args.laplacian_loss_weight

    post_fix = '/' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    # data preprocessing
    feature_u, feature_v, feature_dim, all_M_u, all_M_v, side_feature_u, side_feature_v, all_M, mask, \
        user_item_matrix_train, user_item_matrix_test, laplacian_u, laplacian_v = prepare(args)

    if not os.path.exists(saved_model_folder):
        os.makedirs(saved_model_folder)
    weights_name = saved_model_folder + post_fix + '_weights'

    net = utils.create_models(feature_u, feature_v, feature_dim, hidden_dim,
                              rate_num, all_M_u, all_M_v, side_hidden_dim,
                              side_feature_u, side_feature_v, use_side_feature,
                              use_GAT, out_dim, user_item_matrix_train, drop_out)
    net.train()

    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
    Loss = utils.loss(all_M, mask, user_item_matrix_train, laplacian_loss_weight)

    iter_bar = tqdm(range(num_epochs), desc='Iter (loss=X.XXX)')
    for epoch in iter_bar:
        optimizer.zero_grad()
        score = net.forward()
        loss = Loss.loss(score)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            rmse = Loss.rmse(score)
            val_rmse = validate(score, rate_num, user_item_matrix_test)
            iter_bar.set_description(
                'Iter (loss=%5.3f, rmse=%5.3f, val_rmse=%5.5f)'
                % (loss.item(), rmse.item(), val_rmse.item()))

        if epoch % save_steps == 0:
            torch.save(net.state_dict(), weights_name)

    rmse = Loss.rmse(score)
    print('Final training RMSE: ', rmse.data.item())
    torch.save(net.state_dict(), weights_name)

    # turn the per-rating scores into an expected rating
    sm = nn.Softmax(dim=0)
    score = sm(score)
    score_list = torch.split(score, rate_num)
    pred = 0
    for i in range(rate_num):
        pred += (i + 1) * score_list[0][i]
    pred = utils.var_to_np(pred)

    # test the performance on the held-out entries
    test_mask = user_item_matrix_test > 0
    square_err = (pred * test_mask - user_item_matrix_test) ** 2
    mse = square_err.sum() / test_mask.sum()
    test_rmse = np.sqrt(mse)
    print('Test RMSE: ', test_rmse)
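# Both main() variants call validate(score, rate_num, user_item_matrix_test),
# whose definition is not shown in this listing. A plausible sketch, mirroring
# the expected-rating computation at the end of main(); this is an assumption,
# not the repo's actual helper.
import numpy as np
import torch
import torch.nn as nn


def validate(score, rate_num, user_item_matrix_test):
    """Validation RMSE from per-rating scores (hypothetical reconstruction)."""
    prob = nn.Softmax(dim=0)(score)
    prob = torch.split(prob, rate_num)[0]
    pred = 0
    for i in range(rate_num):
        pred = pred + (i + 1) * prob[i]      # expected rating per (user, item)
    test_mask = torch.from_numpy((user_item_matrix_test > 0).astype(np.float32))
    test_matrix = torch.from_numpy(user_item_matrix_test.astype(np.float32))
    square_err = (pred.cpu() * test_mask - test_matrix) ** 2
    return torch.sqrt(square_err.sum() / test_mask.sum())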