def main():
    input_args = train_input()
    print_model(input_args)
    device = torch.device("cuda:0" if torch.cuda.is_available() and input_args.gpu else "cpu")
    model = create_model(input_args.arch, input_args.hidden_units)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), input_args.learning_rate)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    image_datasets, dataloaders = create_dataloaders(input_args.data_dir)
    train(model, dataloaders, image_datasets, criterion, optimizer,
          exp_lr_scheduler, device, input_args.epochs)
    if input_args.save_dir:
        model.cpu()
        save_checkpoint({
            'epoch': input_args.epochs,
            'arch': input_args.arch,
            'classifier': model.classifier,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'mapping': image_datasets['train'].class_to_idx
        }, input_args.save_dir)
def doCheckpoint(in_args, model, output_size):
    """Create a checkpoint and save it to save_dir.

    PARAMETERS:
        in_args: argparse.Namespace object
        model: PyTorch model
        output_size: int
    RETURNS:
        None
    """
    checkpoint = {
        'epochs': in_args.epochs,
        'input_size': model.classifier[0].in_features,
        'output_size': output_size,
        'hidden_size': in_args.hidden_units,
        'arch': in_args.arch,
        'class_to_idx': model.class_to_idx,
    }
    savefile = os.path.join(in_args.save_dir, 'checkpoint.pth')
    mutils.save_checkpoint(checkpoint, model, filename=savefile)
    sys.stdout.write("Checkpoint saved to %s\n" % savefile)
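# The save_checkpoint / mutils.save_checkpoint helpers called above are not shown in this file.
# A minimal sketch of what such a helper could look like, assuming it simply bundles the metadata
# dict with the model's weights and delegates to torch.save (the dict keys and the `filename`
# parameter below are illustrative assumptions, not the project's actual API):
import os
import torch

def save_checkpoint(checkpoint, model, filename='checkpoint.pth'):
    """Attach the model weights to the checkpoint dict and serialize it to disk."""
    checkpoint = dict(checkpoint)                    # avoid mutating the caller's dict
    checkpoint['state_dict'] = model.state_dict()    # weights only; the architecture is rebuilt from 'arch'
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    torch.save(checkpoint, filename)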
def train():
    conf = Config()
    # Print the model configuration
    conf.dump()

    parser = argparse.ArgumentParser(description='Image classification model training')
    parser.add_argument(
        '--resume_checkpoint', action='store', type=str,
        default='model/checkpoint.pth',
        help='Resume the model from this checkpoint and continue training. '
             'If --resume_checkpoint is provided, the --arch, --learning_rate, '
             '--hidden_units, and --drop_p arguments are ignored.')
    args = parser.parse_args()

    # Load the data
    dataloaders, class_to_idx = load_data(conf.data_directory)

    # Resume from the checkpoint if the model file exists, otherwise create a new model
    if args.resume_checkpoint and os.path.exists(args.resume_checkpoint):
        # Load the checkpoint
        print('resume_checkpoint exists, loading the model')
        model, optimizer, epoch, history = load_checkpoint(
            checkpoint_path=args.resume_checkpoint,
            load_optimizer=True,
            gpu=conf.cuda)
        start_epoch = epoch + 1
    else:
        # Create a new model and optimizer
        print('resume_checkpoint not set or the model file does not exist, creating a new model')
        model = create_model(
            arch=conf.arch,
            class_to_idx=class_to_idx,
            hidden_units=conf.hidden_units,
            drop_p=conf.dropout)
        optimizer = create_optimizer(model=model, lr=conf.learning_rate)
        start_epoch = 1
        history = None

    # Train the model
    history, best_epoch = train_model(
        dataloaders=dataloaders,
        model=model,
        optimizer=optimizer,
        gpu=conf.cuda,
        start_epoch=start_epoch,
        epochs=conf.epochs,
        train_history=history)

    # Evaluate the model on the test set
    test_acc = test_model(dataloader=dataloaders['test'], model=model, gpu=conf.cuda)
    print(f'Accuracy on the test set: {(test_acc * 100):.2f}%')

    # Save the model
    save_checkpoint(
        save_path=conf.save_path + conf.save_name,
        epoch=best_epoch,
        model=model,
        optimizer=optimizer,
        history=history)

    # Plot the training history
    plot_history(history)
def train(opt, resnet):
    total_step = len(train_loader)
    starttime = time.time()
    print_idx = 100
    best_loss = 100
    print_loss = 100

    for epoch in range(opt.start_epoch, opt.num_epochs):
        for step, (img1, img2, target) in enumerate(train_loader):
            size = img1.size()[0]
            x_t1 = img1.to(opt.device)
            x_t2 = img2.to(opt.device)

            if step % print_idx == 0:
                print("Epoch [{}/{}], Step [{}/{}], Time (s): {:.1f}".format(
                    epoch + 1, opt.num_epochs, step, total_step,
                    time.time() - starttime))

            _, out_1 = resnet(x_t1)
            _, out_2 = resnet(x_t2)
            loss = contrast_loss(out_1, out_2, size, opt)

            resnet.zero_grad()
            loss.backward()
            optimizer.step()

            print_loss = loss.item()

        print("Epoch: [{}/{}]\tLoss: {:.4f}".format(
            epoch + 1, opt.num_epochs, print_loss))
        model_utils.logfile(opt, epoch, print_loss, optimizer.param_groups[0]['lr'])
        model_utils.save_checkpoint(opt, resnet, epoch, print_loss, best_loss)
        if print_loss < best_loss:
            best_loss = print_loss
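# The loop above starts at opt.start_epoch, which implies training can be resumed from a saved
# checkpoint. A minimal resume sketch, assuming model_utils.save_checkpoint writes a dict with
# 'state_dict' and 'epoch' entries to opt.checkpoint_path (both the path attribute and the key
# names are assumptions, not the project's actual format):
import torch

def maybe_resume(opt, resnet):
    """Load weights and the last completed epoch from a checkpoint, if one exists."""
    try:
        checkpoint = torch.load(opt.checkpoint_path, map_location=opt.device)
    except FileNotFoundError:
        return 0  # nothing to resume; start from epoch 0
    resnet.load_state_dict(checkpoint['state_dict'])
    return checkpoint['epoch'] + 1  # continue with the next epoch

# Usage sketch: opt.start_epoch = maybe_resume(opt, resnet) before calling train(opt, resnet).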
context_ids_tens = torch.LongTensor(context_ids).to(args.device)
neg_ids = vocab.neg_sample(size=context_ids.shape)
if args.section2vec:
    neg_ids[:, 0] = np.random.choice(section_id_range, size=[args.batch_size])
neg_ids_tens = torch.LongTensor(neg_ids).to(args.device)

kl_loss, recon_loss = model(center_ids_tens, context_ids_tens, neg_ids_tens, num_contexts)
joint_loss = kl_loss + recon_loss
joint_loss.backward()  # backpropagate loss

epoch_kl_loss += kl_loss.item()
epoch_recon_loss += recon_loss.item()
epoch_joint_loss += joint_loss.item()
optimizer.step()

epoch_joint_loss /= float(batcher.num_batches())
epoch_kl_loss /= float(batcher.num_batches())
epoch_recon_loss /= float(batcher.num_batches())
sleep(0.1)
print('Epoch={}. Joint loss={}. KL Loss={}. Reconstruction Loss={}'.format(
    epoch, epoch_joint_loss, epoch_kl_loss, epoch_recon_loss))
assert not batcher.has_next()

# Serialize everything from model weights and optimizer state to the loss values and arguments
losses_dict = {'losses': {'joint': epoch_joint_loss, 'kl': epoch_kl_loss, 'recon': epoch_recon_loss}}
checkpoint_fp = os.path.join(weights_dir, 'checkpoint_{}.pth'.format(epoch))
save_checkpoint(args, model, optimizer, vocab, losses_dict, checkpoint_fp=checkpoint_fp)

# Run evaluations
evaluate(args)
def save(self, alias, epoch_id, hit_ratio, ndcg):
    assert hasattr(self, 'model'), 'Please specify the exact model!'
    model_dir = self.config['model_dir'].format(alias, epoch_id, hit_ratio, ndcg)
    save_checkpoint(self.model, model_dir)
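# save_checkpoint here takes only a model and a target path. A minimal sketch of such a helper,
# assuming it persists just the weights (state_dict), so the caller is responsible for rebuilding
# the same architecture before reloading; the resume_checkpoint counterpart below is likewise an
# illustrative assumption:
import torch

def save_checkpoint(model, model_dir):
    """Persist only the model weights; smaller and more portable than pickling the whole model."""
    torch.save(model.state_dict(), model_dir)

def resume_checkpoint(model, model_dir, device_id=None):
    """Load weights back into an already-constructed model of the same architecture."""
    map_location = None if device_id is None else 'cuda:{}'.format(device_id)
    state_dict = torch.load(model_dir, map_location=map_location)
    model.load_state_dict(state_dict)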
    help=f'Sizes of hidden layers in model classifier. Can pass multiple arguments. '
         f'Default: {" ".join([str(_) for _ in def_hidden_units])}.')
parser.add_argument(
    '--output_units',
    nargs='?',
    default=def_output_units,
    type=int,
    help=f'Size of output layer, or number of prediction classes. Default is {def_output_units}.')
parser.add_argument(
    '--epochs',
    nargs='?',
    default=def_epochs,
    type=int,
    help=f'Number of training epochs to run. Default is {def_epochs}.')
parser.add_argument('--gpu', action='store_true',
                    help='Pass this flag to use GPU if available.')
args = parser.parse_args()
print(args)

loaders = build_data_loaders(args.data_dir)
model = build_model(args.arch, args.hidden_units, args.output_units)
best_model = train(model, args.epochs, args.learning_rate, args.gpu, loaders)

now = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%dT%H%M%S')
save_checkpoint(f'{args.save_dir}/checkpoint-{args.arch}-{now}.pth', best_model, args.arch)
def acronyms_finetune(args):
    args.git_hash = get_git_revision_hash()
    render_args(args)

    prev_args, bsg_model, vocab, _ = restore_model(args.bsg_experiment)

    # Load Data
    data_dir = '../eval/eval_data/minnesota/'
    sense_fp = os.path.join(data_dir, 'sense_inventory_ii')
    lfs, lf_sf_map, sf_lf_map = parse_sense_df(sense_fp)
    df = pd.read_csv(os.path.join(data_dir, 'preprocessed_dataset_window_{}.csv'.format(prev_args.window)))
    df['target_lf_idx'] = df['sf'].combine(df['target_lf'], lambda sf, lf: target_lf_index(lf, sf_lf_map[sf]))

    prev_N = df.shape[0]
    df = df[df['target_lf_idx'] > -1]
    print('Removed {} examples for which the target LF is not exactly in sense inventory ii'.format(
        prev_N - df.shape[0]))

    sfs = df['sf'].unique().tolist()
    used_sf_lf_map = defaultdict(list)
    dominant_sfs = set()

    for sf in sfs:
        subset_df = df[df['sf'] == sf]
        used_target_idxs = subset_df['target_lf_idx'].unique()
        if len(used_target_idxs) == 1:
            dominant_sfs.add(sf)
        else:
            for lf_idx in used_target_idxs:
                used_sf_lf_map[sf].append(sf_lf_map[sf][lf_idx])

    prev_N = df.shape[0]
    df = df[~df['sf'].isin(dominant_sfs)]
    print('Removing {} examples from {} SFs because they have only 1 sense associated with '
          'them after preprocessing'.format(prev_N - df.shape[0], len(dominant_sfs)))

    df['used_target_lf_idx'] = df['sf'].combine(df['target_lf'],
                                                lambda sf, lf: target_lf_index(lf, used_sf_lf_map[sf]))

    sf_tokenized_lf_map = {}
    for sf, lf_list in used_sf_lf_map.items():
        sf_tokenized_lf_map[sf] = list(map(lf_tokenizer, lf_list))

    train_df, test_df = train_test_split(df, random_state=1992, test_size=0.2)
    train_batcher = AcronymBatcherLoader(train_df, batch_size=args.batch_size)
    test_batcher = AcronymBatcherLoader(test_df, batch_size=args.batch_size)
    render_test_statistics(test_df, used_sf_lf_map)

    # Create the experiment's weights directory, clearing it if it already exists
    weights_dir = os.path.join('../acronyms', 'weights', args.experiment)
    if os.path.exists(weights_dir):
        print('Clearing out previous weights in {}'.format(weights_dir))
        rmtree(weights_dir)
    os.mkdir(weights_dir)
    results_dir = os.path.join(weights_dir, 'results')
    os.mkdir(results_dir)
    os.mkdir(os.path.join(results_dir, 'confusion'))

    model = AcronymExpander(bsg_model)

    # Instantiate the Adam optimizer over trainable parameters only
    trainable_params = filter(lambda x: x.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(trainable_params, lr=args.lr)
    loss_func = nn.CrossEntropyLoss()

    # Deep copies keep the best weights from being overwritten by later updates
    # (`copy` is the standard-library module, assumed imported at the top of the file)
    best_weights = copy.deepcopy(model.state_dict())
    best_epoch = 1
    lowest_test_loss = run_test_epoch(args, test_batcher, model, loss_func, vocab, sf_tokenized_lf_map)

    # Put the model in training mode (enables dropout etc.; it does not touch requires_grad)
    model.train()
    for epoch in range(1, args.epochs + 1):
        sleep(0.1)  # Keep logging synchronous with the tqdm progress bar
        print('Starting Epoch={}'.format(epoch))

        train_loss = run_train_epoch(args, train_batcher, model, loss_func, optimizer, vocab, sf_tokenized_lf_map)
        test_loss = run_test_epoch(args, test_batcher, model, loss_func, vocab, sf_tokenized_lf_map)
        losses_dict = {'train': train_loss, 'test_loss': test_loss}

        checkpoint_fp = os.path.join(weights_dir, 'checkpoint_{}.pth'.format(epoch))
        save_checkpoint(args, model, optimizer, vocab, losses_dict, checkpoint_fp=checkpoint_fp)

        # Track the best epoch; only update the stored weights when the test loss improves
        if test_loss < lowest_test_loss:
            lowest_test_loss = test_loss
            best_epoch = epoch
            best_weights = copy.deepcopy(model.state_dict())

        if args.debug:
            break

    print('Loading weights from epoch {} to perform error analysis'.format(best_epoch))
    model.load_state_dict(best_weights)
    losses_dict['test_loss'] = lowest_test_loss
    checkpoint_fp = os.path.join(weights_dir, 'checkpoint_best.pth')
    save_checkpoint(args, model, optimizer, vocab, losses_dict, checkpoint_fp=checkpoint_fp)
    error_analysis(test_batcher, model, used_sf_lf_map, loss_func, vocab, results_dir=results_dir)
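# checkpoint_best.pth above is written by save_checkpoint(args, model, optimizer, vocab, ...),
# whose on-disk format is not shown here. A hedged sketch of reloading it for later analysis,
# assuming the helper stores a dict with 'model_state_dict', 'optimizer_state_dict', 'vocab' and
# 'losses' keys (these key names are assumptions, not the project's actual format):
import torch

def load_best_checkpoint(model, optimizer, checkpoint_fp, device='cpu'):
    """Restore model and optimizer state from a saved checkpoint file."""
    checkpoint = torch.load(checkpoint_fp, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint.get('vocab'), checkpoint.get('losses')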
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()

# Forward and backward passes
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()

running_loss += loss.item()

if steps % print_every == 0:
    model.eval()  # Put the model in evaluation mode
    with torch.no_grad():  # Turn off gradient tracking during validation; faster and saves memory
        validation_loss, accuracy = validation(model, validloader, criterion)
    print("Epoch: {}/{}... ".format(e + 1, epochs),
          "Training Loss: {:.4f}".format(running_loss / print_every),
          "Validation Loss: {:.4f}".format(validation_loss / print_every),
          "Validation accuracy: {:.4f}".format(accuracy))
    running_loss = 0
    model.train()  # Switch back to training mode before resuming training

# Save checkpoint
save_checkpoint('checkpoint4.pth', model_type, model, learning_rate, hidden_units)
def eval_epoch(self):
    self.model.eval()
    test_idxs = np.arange(0, len(self.valid_dataset))
    num_batches = len(self.valid_dataset) // self.val_batch_size

    # Statistics to collect
    total_correct = 0
    total_seen = 0
    loss_sum = 0
    total_seen_class = [0 for _ in range(self.config.NUM_CLASSES)]
    total_correct_class = [0 for _ in range(self.config.NUM_CLASSES)]
    iou2ds_sum = 0
    iou3ds_sum = 0
    iou3d_correct_cnt = 0

    # Simple evaluation with batches
    for batch_idx in range(num_batches):
        start_idx = batch_idx * self.val_batch_size
        end_idx = (batch_idx + 1) * self.val_batch_size

        batch_data, batch_label, batch_center, \
            batch_hclass, batch_hres, \
            batch_sclass, batch_sres, \
            batch_rot_angle, batch_one_hot_vec = \
            tuple(get_batch(self.valid_dataset, test_idxs, start_idx, end_idx,
                            self.config.NUM_POINT, self.config.NUM_CHANNELS))

        with torch.no_grad():
            self.endpoints = self.model(batch_data, batch_one_hot_vec)
            val_loss = self.loss(batch_label, batch_center, batch_hclass,
                                 batch_hres, batch_sclass, batch_sres)

        preds_val = np.argmax(self.endpoints['mask_logits'].detach().cpu().numpy(), 2)
        correct = np.sum(preds_val == batch_label.detach().cpu().numpy())
        total_correct += correct
        total_seen += (self.val_batch_size * self.config.NUM_POINT)
        loss_sum += val_loss

        iou2ds, iou3ds = compute_box3d_iou(
            self.endpoints['center'].detach().cpu().numpy(),
            self.endpoints['heading_scores'].detach().cpu().numpy(),
            self.endpoints['heading_residuals'].detach().cpu().numpy(),
            self.endpoints['size_scores'].detach().cpu().numpy(),
            self.endpoints['size_residuals'].detach().cpu().numpy(),
            batch_center.detach().cpu().numpy(),
            batch_hclass.detach().cpu().numpy(),
            batch_hres.detach().cpu().numpy(),
            batch_sclass.detach().cpu().numpy(),
            batch_sres.detach().cpu().numpy())
        self.endpoints['iou2ds'] = iou2ds
        self.endpoints['iou3ds'] = iou3ds

        iou2ds_sum += np.sum(self.endpoints['iou2ds'])
        iou3ds_sum += np.sum(self.endpoints['iou3ds'])
        iou3d_correct_cnt += np.sum(self.endpoints['iou3ds'] >= 0.7)

        for l in range(self.config.NUM_CLASSES):
            total_seen_class[l] += np.sum(batch_label.detach().cpu().numpy() == l)
            total_correct_class[l] += np.sum((preds_val == l) &
                                             (batch_label.detach().cpu().numpy() == l))

    seg_acc = total_correct / float(total_seen)
    iou_ground = iou2ds_sum / float(self.val_batch_size * num_batches)
    iou_3d = iou3ds_sum / float(self.val_batch_size * num_batches)
    box_acc = float(iou3d_correct_cnt) / float(self.val_batch_size * num_batches)

    self.log_values(batch_idx, loss_sum / float(num_batches), seg_acc,
                    iou_ground, iou_3d, box_acc, 'Val')

    if self.best_val_loss > (loss_sum / float(num_batches)):
        self.best_val_loss = loss_sum / float(num_batches)
        self.best_model = self.model
        save_checkpoint('./models/best_model.pth', self.model, self.epoch,
                        self.optimizer, self.best_val_loss)
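# save_checkpoint here receives a path, the model, the current epoch, the optimizer, and the best
# validation loss. A minimal sketch of such a helper, assuming it packs everything needed to resume
# training into a single dict (the key names are illustrative, not the project's actual format):
import torch

def save_checkpoint(path, model, epoch, optimizer, best_val_loss):
    """Serialize model, optimizer, and bookkeeping needed to resume training."""
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_val_loss': best_val_loss,
    }, path)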
start_epoch = 1
history = None

# Train model ###########################
history, best_epoch = train_model(dataloaders=dataloaders,
                                  model=model,
                                  optimizer=optimizer,
                                  gpu=gpu,
                                  start_epoch=start_epoch,
                                  epochs=args.epochs,
                                  train_history=history)

# Check performance on the test data set
# test_acc = test_model(
#     dataloader=dataloaders['test'], model=model, gpu=gpu)
# print(f'\nModel achieved accuracy of {(test_acc * 100):.2f}% on Test data set.')

# Plot training history
plot_history(history)
# NOTE: plot_history() currently does not work in the Udacity workspace because
# no display device is available

# Save checkpoint ###########################
save_checkpoint(save_path=args.save_path,
                epoch=best_epoch,
                model=model,
                optimizer=optimizer,
                history=history)
# Constants
# output_cats = 102  # number of flower classifications (could be made a command line input for other training sets)

args = get_args_train()

if args.device == 'gpu' and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')  # fall back to CPU so `device` is always defined
    print("Model should be trained on GPU, enable and select --gpu gpu for training")

train_data, test_data, validation_data, trainloader, testloader, validationloader = load_data(
    args.data_directory)

pretrain_model, arch_inFeatures = pretrained_model(args.arch)
model, criterion = create_classifier(pretrain_model, arch_inFeatures,
                                     args.hidden_units, args.output_cats)
optimizer = optim.Adam(model.classifier.parameters(), lr=args.lr)

trained_model = train_model(model, args.epochs, trainloader, validationloader,
                            device, optimizer, criterion)
tested_model = test_model(trained_model, testloader, device, optimizer, criterion)

save_checkpoint(trained_model, args.save_directory, args.arch, train_data,
                optimizer, args.epochs, args.hidden_units)
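# The checkpoints saved throughout this file typically record the architecture name, the custom
# classifier (or its hyperparameters), the weights, and the class_to_idx mapping. A hedged sketch
# of rebuilding such a model for inference, assuming a torchvision backbone and the key names used
# in the first snippet ('arch', 'classifier', 'state_dict', 'mapping'); these are assumptions, not
# a guaranteed checkpoint format:
import torch
from torchvision import models

def load_model_for_inference(checkpoint_path):
    """Rebuild a transfer-learning classifier from a saved checkpoint and prepare it for inference."""
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model = getattr(models, checkpoint['arch'])()       # e.g. models.vgg16(); weights come from the checkpoint
    model.classifier = checkpoint['classifier']         # restore the custom classification head
    model.load_state_dict(checkpoint['state_dict'])     # restore all weights (backbone + head)
    model.class_to_idx = checkpoint['mapping']
    model.eval()
    return model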