embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                               args.word_vectors_file,
                                               dictionary.word2idx)
print('number of OOV words = ', len(dictionary) - len(embeddings_index))

###############################################################################
# Build the model
###############################################################################

model = NSRF(dictionary, embeddings_index, args)
print(model)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), args.lr)
best_loss = sys.maxsize

param_dict = helper.count_parameters(model)
print('number of trainable parameters = ', numpy.sum(list(param_dict.values())))

if args.cuda:
    model = model.cuda()

if args.resume:
    if os.path.isfile(args.resume):
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = helper.load_checkpoint(args.resume)
        args.start_epoch = checkpoint['epoch']
        best_loss = checkpoint['best_loss']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
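# helper.load_checkpoint is not shown in this excerpt. A minimal sketch, assuming
# it is a thin wrapper around torch.load that maps storages to CPU so a checkpoint
# trained on a GPU can be resumed on any device:
import torch

def load_checkpoint(filename):
    return torch.load(filename, map_location=lambda storage, loc: storage)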
                             seq_length=seq_length, device=device)

# and make our data loaders
# batch size is exactly 1 character by default, which is exactly what we need
train_loader = DataLoader(train_data)
validation_loader = DataLoader(validation_data)

# Part 3: modelling

# we create our model
model = CharRNN(num_chars).to(device)
# and the initial hidden state (a tensor of zeros)
initial_state = model.init_hidden(batch_size, device)

# we evaluate the capability of our model
# a character to parameter ratio approaching 1 is optimal
# too many parameters and the model may overfit
# too few and the model may underfit
char_param_ratio = len(text) / count_parameters(model)
print("Character to model parameter ratio: %f\n" % char_param_ratio)

# Part 4: training
train(model, initial_state, train_loader=train_loader,
      validation_loader=validation_loader, epochs=100)

# Part 5: evaluation
print(sample(model, char2int))
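# count_parameters is not defined in this excerpt. Assuming it returns the number
# of trainable parameters (the quantity the character-to-parameter ratio above is
# computed against), a minimal sketch would be:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)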
model = LSTM(dictionary, embeddings_index, args)
selector = Selector(dictionary, embeddings_index, args)
print(selector)
print(model)

optim_fn_selector, optim_params_selector = helper.get_optimizer(args.optimizer)
optimizer_selector = optim_fn_selector(
    filter(lambda p: p.requires_grad, selector.parameters()), **optim_params_selector)
optim_fn, optim_params = helper.get_optimizer(args.optimizer)
optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params)

best_acc = 0

param_dict_selector = helper.count_parameters(selector)
param_dict = helper.count_parameters(model)
print('number of trainable parameters = ',
      numpy.sum(list(param_dict_selector.values())),
      numpy.sum(list(param_dict.values())),
      numpy.sum(list(param_dict.values())) + numpy.sum(list(param_dict_selector.values())))

if args.cuda:
    torch.cuda.set_device(args.gpu)
    selector = selector.cuda()
    model = model.cuda()

if args.load_model == 0 or args.load_model == 2:
    print('loading selector')
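# helper.get_optimizer is not shown above. A minimal sketch, assuming it maps an
# optimizer spec string such as "adam" or "sgd,lr=0.1" to a torch.optim class plus
# its keyword arguments (the supported names and parsing rules here are illustrative):
import torch.optim as optim

def get_optimizer(spec):
    name, _, rest = spec.partition(',')
    kwargs = {}
    if rest:
        kwargs = {k: float(v) for k, v in (kv.split('=') for kv in rest.split(','))}
    optim_fns = {'adam': optim.Adam, 'sgd': optim.SGD,
                 'adagrad': optim.Adagrad, 'rmsprop': optim.RMSprop}
    return optim_fns[name.lower()], kwargs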
for param in model.parameters():
    param.requires_grad = False

if model_to_train == "ResNet152":
    params_to_optimize_in_top = list(model.fc.parameters())
elif model_to_train == "nViewNet" or model_to_train == "nViewNet_resume":
    params_to_optimize_in_top = list(model.collapse.parameters()) + list(model.fc.parameters())

for param in params_to_optimize_in_top:
    param.requires_grad = True

optimizer_top = optim.Adam(params_to_optimize_in_top, lr=lr_top)
# lr_scheduler_top = lr_scheduler.StepLR(optimizer_top, step_size=20, gamma=0.8)
lr_scheduler_top = None

print("Training Top:", count_parameters(model), "Parameters")
b_acc = train(model, dataloaders_top, criterion, optimizer_top, epochs_top,
              scheduler=lr_scheduler_top, best_acc=0)
print('Finished training top, best acc {:.4f}'.format(b_acc))

# Set all parameters to train (require gradient)
for param in model.parameters():
    param.requires_grad = True

print("Training All:", count_parameters(model), "Parameters")

# Optimizer for Entire Network