def fine_tune(self):
    """Fine-tune self.model with SGD, tracking the best test accuracy.

    Runs for self.epochs epochs, decaying the learning rate 10x at epochs
    10 and 20, then appends the best epoch's metrics to self.overall_log.
    """
    best_acc = 0.0
    best_loss = None
    best_epoch = 0  # guard: avoids UnboundLocalError below if accuracy never improves
    self.lr = 0.01
    self.optimizer = torch.optim.SGD(self.model.parameters(),
                                     lr=self.lr,
                                     momentum=0.9,
                                     weight_decay=5e-4)
    print("Beginning Training for", self.epochs, " Epochs")
    # BUG FIX: iterate self.epochs, not a hard-coded 40, to match the
    # announced epoch count above.
    for epoch in range(1, self.epochs + 1):
        if epoch == 10 or epoch == 20:
            # 10x learning-rate decay at the scheduled milestones.
            self.lr = self.lr * 0.1
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        acc = round(acc.item(), 4)
        # Save best performance model
        if best_acc < acc:
            best_model_wts = copy.deepcopy(self.model.state_dict())
            best_epoch = epoch
            best_acc = acc
            best_loss = loss
    # Save Best model
    # torch.save(best_model_wts, self.checkpoint_path.format(epoch=best_epoch, acc=round(best_acc * 100, 2)))
    # Record Metrics
    self.overall_log.append({"Experiment": self.exp_name,
                             "Epoch": best_epoch,
                             "Test_Acc": best_acc,
                             "Test_Loss": best_loss})
    train_utils.record_overall_metrics(self, ['Experiment', 'Epoch', "Test_Acc", "Test_Loss"])
def main(args):
    """Evaluate a trained FCN-ResNet50 checkpoint on the VOC2012 val split."""
    run_device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    assert os.path.exists(args.weights), f"weights {args.weights} not found."

    # segmentation nun_classes + background
    n_classes = args.num_classes + 1

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> val.txt
    dataset = VOCSegmentation(args.data_path,
                              year="2012",
                              transforms=SegmentationPresetEval(520),
                              txt_name="val.txt")
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=1,
                                         num_workers=8,
                                         pin_memory=True,
                                         collate_fn=dataset.collate_fn)

    # Restore the checkpointed weights onto a fresh model.
    net = fcn_resnet50(aux=args.aux, num_classes=n_classes)
    checkpoint = torch.load(args.weights, map_location=run_device)
    net.load_state_dict(checkpoint['model'])
    net.to(run_device)

    confmat = evaluate(net, loader, device=run_device, num_classes=n_classes)
    print(confmat)
def train_model(self):
    """Train self.model for self.epochs epochs and log best test metrics.

    The learning rate is set to 0.01 at epoch 80 and 0.001 at epoch 140.
    """
    best_acc = 0.0
    best_loss = None
    best_epoch = 0  # guard: avoids UnboundLocalError below if accuracy never improves
    # epoch -> new learning rate (fixed milestone schedule); replaces the
    # duplicated if/elif branches that each re-assigned param_groups.
    milestones = {80: 0.01, 140: 0.001}
    print("Beginning Training for", self.epochs, " Epochs")
    for epoch in range(1, self.epochs + 1):
        if epoch in milestones:
            self.lr = milestones[epoch]
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        acc = round(acc.item(), 4)
        loss = round(loss, 4)
        # Save best performance model
        if best_acc < acc:
            best_model_wts = copy.deepcopy(self.model.state_dict())
            best_epoch = epoch
            best_acc = acc
            best_loss = loss
    # Save Best model
    # torch.save(best_model_wts, self.model_path.format(task=self.task, epoch=best_epoch, acc=round(best_acc * 100, 2)))
    # Record Metrics
    train_utils.record_metrics(self)
    self.overall_log.append({"Task": self.task,
                             "Epoch": best_epoch,
                             "Test_Acc": round(best_acc * 100, 2),
                             "Test_Loss": best_loss})
    train_utils.record_overall_metrics(self)
def train_model(self):
    """Train self.model, checkpointing the best-accuracy weights.

    LR schedule: 0.01 at epoch 80, 0.001 at epoch 140. The best checkpoint
    is written via self.checkpoint_path and the run summary appended to
    self.overall_log.
    """
    best_acc = 0.0
    best_loss = None
    best_epoch = 0  # guard: avoids UnboundLocalError below if accuracy never improves
    print("Beginning Training for", self.epochs, " Epochs")
    for epoch in range(1, self.epochs + 1):
        if epoch == 80:
            self.lr = 0.01
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        elif epoch == 140:
            self.lr = 0.001
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        # BUG FIX: this conversion was commented out; without it best_acc
        # stays a 0-dim tensor, so the checkpoint filename is formatted from
        # a tensor and round(best_acc * 100, 2) below operates on a tensor.
        acc = round(acc.item(), 4)
        # Save best performance model
        if best_acc < acc:
            best_model_wts = copy.deepcopy(self.model.state_dict())
            best_epoch = epoch
            best_acc = acc
            best_loss = loss
            print(f"Saving best model: Loss={best_loss}, Acc={best_acc}, Ep={best_epoch}")
            # Save Best model
            torch.save(best_model_wts,
                       self.checkpoint_path.format(epoch=best_epoch, acc=best_acc))
    # Record Metrics
    self.overall_log.append({"Experiment": self.exp_name,
                             "Epoch": best_epoch,
                             "Test_Acc": round(best_acc * 100, 2),
                             "Test_Loss": best_loss})
    train_utils.record_overall_metrics(self, ['Experiment', 'Epoch', "Test_Acc", "Test_Loss"])
def fit():
    """Train the OCR model, printing sample validation predictions per epoch."""
    (
        train_img,
        test_img,
        train_labels,
        test_labels,
        train_orig_labels,
        test_orig_targets,
    ) = model_selection.train_test_split(IMAGES,
                                         LABELS_ENCODED,
                                         LABELS_NAMES,
                                         test_size=0.1,
                                         random_state=2020)

    train_dataset = OcrDataset(image_path=train_img,
                               labels=train_labels,
                               resize=(IMAGE_HEIGHT, IMAGE_WIDTH))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=BATCH_SIZE,
                                               num_workers=NUM_WORKERS,
                                               shuffle=True)
    test_dataset = OcrDataset(image_path=test_img,
                              labels=test_labels,
                              resize=(IMAGE_HEIGHT, IMAGE_WIDTH))
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=BATCH_SIZE,
                                              num_workers=NUM_WORKERS,
                                              shuffle=False)

    # NOTE(review): `labels_encoded` (lowercase) is presumably the fitted
    # label encoder defined at module level — confirm it is not a typo for
    # LABELS_ENCODED used in the split above.
    model = OcrModel_v0(num_characters=len(labels_encoded.classes_))
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.8,
                                                           patience=2,
                                                           verbose=True)

    for epoch in range(EPOCHS):
        train_loss = train(model, train_loader, optimizer)
        valid_preds, valid_loss = evaluate(model, test_loader)
        # BUG FIX: the plateau scheduler was created but never stepped, so
        # the learning rate could never decay; feed it the validation loss.
        scheduler.step(valid_loss)

        # Decode the raw network outputs into character strings.
        valid_final_preds = []
        for pred in valid_preds:
            cur_preds = decode_preds(pred, labels_encoded)
            valid_final_preds.extend(cur_preds)

        show_preds_list = list(zip(test_orig_targets, valid_final_preds))[1:3]
        pprint(show_preds_list)
        pprint("-" * 90)
        pprint(
            f"Epoch: {epoch} | Train loss = {train_loss} | Valid loss = {valid_loss} |"
        )
        pprint("-" * 90)
def finetune_classifier(self, task, ittr="0"):
    """Freeze the backbone, attach a fresh 5-way linear head, and fine-tune
    it on the given CIFAR task, checkpointing the best weights and recording
    the task's metrics in self.classifier_results."""
    print('-' * 50)
    print("Training task:\t", task)
    self.data_loaders = train_utils.CIFAR_dl_task(self, task, self.per_task_norm)
    best_acc = 0.0

    # Setup Model: backbone frozen, only the new head trains.
    net = self.backbone_model
    for p in net.parameters():
        p.requires_grad = False
    net.fc = nn.Linear(512, 5)
    self.model = net.to(self.device)

    self.lr = 0.01
    self.optimizer = torch.optim.SGD(self.model.parameters(),
                                     lr=self.lr,
                                     momentum=0.9,
                                     weight_decay=5e-4)
    print("Finetuning for", self.epochs, " Epochs")
    for epoch in range(1, self.epochs + 1):
        # 10x LR decay at epochs 10 and 20.
        if epoch in (10, 20):
            self.lr = self.lr * 0.1
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        acc = round(acc.item(), 4)
        loss = round(loss, 4)
        # Checkpoint whenever accuracy improves on the best seen so far.
        if acc > best_acc:
            best_wts = copy.deepcopy(self.model.state_dict())
            best_acc = acc
            best_loss = loss
            best_epoch = epoch
            # Save Best model
            torch.save(
                best_wts,
                self.classifier_path.format(exp=ittr,
                                            task=task,
                                            epoch=best_epoch,
                                            acc=round(best_acc * 100, 2)))
    # Record Metrics
    self.classifier_results.append({
        "Task": task,
        "Acc": round(best_acc * 100, 2),
        "Loss": best_loss
    })
def generalization_test():
    """Measure copy-task cost versus sequence length for three trained models.

    Returns (costs, lengths): a dict of per-model cost lists and the array
    of tested sequence lengths (10, 20, ..., 100), each cost averaged over
    20 evaluation runs.
    """
    lengths = np.arange(10, 101, 10)
    costs = {'lstm': [], 'ntm_lstm': [], 'ntm_mlp': []}

    # Load trained models
    ntm_lstm = load_model('checkpoints/ntm/copy-batch-1125.0--LSTM.model', 'NTM')
    ntm_mlp = load_model('checkpoints/ntm/copy-batch-7500.0--MLP.model', 'NTM')
    lstm, _ = load_model_v2('checkpoints/lstm/copy-batch-1000000.0.model',
                            model_type='LSTM')

    # Average over 20 runs
    for seq_len in tqdm_notebook(lengths):
        ntm_loader = random_binary(max_seq_length=seq_len,
                                   num_sequences=None,
                                   batch_Size=1,
                                   min_seq_length=seq_len - 1)
        lstm_ctrl_cost, _, _ = evaluate(ntm_lstm, ntm_loader, 1, 'LSTM', False,
                                        how_many=20)
        costs['ntm_lstm'].append(lstm_ctrl_cost)
        mlp_ctrl_cost, _, _ = evaluate(ntm_mlp, ntm_loader, 1, 'MLP', False,
                                       how_many=20)
        costs['ntm_mlp'].append(mlp_ctrl_cost)

        baseline_loader = sequence_loader(100,
                                          batch_size=1,
                                          min_length=seq_len - 1,
                                          max_length=seq_len)
        baseline_cost, _, _ = evaluate_lstm_baseline_v2(lstm, baseline_loader,
                                                        1, False)
        costs['lstm'].append(baseline_cost)
    return costs, lengths
def test(args):
    """Load (or build and cache) the corpus, restore a pre-trained model,
    and report validation and test perplexity."""
    # Corpus cache keyed by a hash of the dataset path.
    cache_file = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(cache_file):
        print('Loading cached dataset...')
        corpus = torch.load(cache_file)
        print('Done')
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, cache_file)
        print('Done')

    vocab_size = len(corpus.dictionary)
    batch_size = args.batchSize
    val_data = batchify(corpus.valid, batch_size, args)
    test_data = batchify(corpus.test, batch_size, args)

    if not os.path.isfile(args.weightFile):
        print('Pre-trained weight file does not exist. Please check the location: {}'.format(args.weightFile))
        exit()

    model, criterion, _, _ = model_load(args.weightFile)
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    # Run on validation data.
    val_loss = evaluate(args, model, criterion, val_data, vocab_size, batch_size)
    print('=' * 89)
    print('| End of Validation | val loss {:5.2f} | val ppl {:8.2f}'.format(
        val_loss, math.exp(val_loss)))
    print('=' * 89)

    # Run on test data.
    test_loss = evaluate(args, model, criterion, test_data, vocab_size, batch_size)
    print('=' * 89)
    print('| End of Testing | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
def train(model, trainloader, testloader, criterion, optimizer):
    """Train `model` for 20 epochs, checkpointing when test accuracy matches
    or beats the best seen so far.

    BUG FIX: `best_accuracy` was never updated after a save, so the
    checkpoint was overwritten after every epoch regardless of accuracy.
    """
    best_accuracy = 0.0
    for epoch in range(20):  # loop over the dataset multiple times
        running_loss = 0.0
        index = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs.double())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            index += 1
            print(
                "Epoch ", epoch,
                " (Index ", str(index), "/", str(len(trainloader)),
                " Loss : ", loss.item(), ")",
            )
        test_accuracy = evaluate(model.double(), testloader, "Test Accuracy")
        if test_accuracy >= best_accuracy:
            best_accuracy = test_accuracy  # remember the new best before saving
            save_model(model, optimizer, name="models/char_model.pth")
        print("loss: ", running_loss / len(trainloader))
    print("Finished Training")
def adam_evaluate(neurons, lr, lr_decay, epochs, batch_size):
    """Cross-validated objective for Bayesian optimisation of the spatial
    model; returns the mean accuracy over 10 stratified folds."""
    # The Gaussian Process' space is continous, so we need to round some values
    neurons, epochs, batch_size = (int(round(v))
                                   for v in (neurons, epochs, batch_size))

    # K-fold stratified cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(features, labels):
        x_train = [features[i] for i in train_index]
        x_test = [features[i] for i in test_index]
        y_train = to_categorical([labels[i] for i in train_index])
        y_test = to_categorical([labels[i] for i in test_index])

        # Create and fit the LSTM network, one sample per update.
        model = get_spatial_model(layers=[neurons],
                                  lr=lr,
                                  lr_decay=lr_decay,
                                  input_shape=(len(x_train[0]), ))
        for _ in range(epochs):
            for X, Y in zip(x_train, y_train):
                model.train_on_batch(np.array([X]), np.array([Y]))

        # Final evaluation of the model on this fold.
        evals = train_utils.evaluate(model, x_test, y_test)
        fold_losses = [e[0] for e in evals]
        fold_accs = [e[1] for e in evals]
        scores.append([np.mean(fold_losses), np.mean(fold_accs)])

    losses = [s[0] for s in scores]
    accuracies = [s[1] for s in scores]
    print("Test loss and Standard dev: %.2f (+/- %.2f)" %
          (np.mean(losses), np.std(losses)))
    print("Test accuracy and Standard dev: %.2f%% (+/- %.2f%%)" %
          (np.mean(accuracies) * 100, np.std(accuracies) * 100))
    return np.mean(accuracies)
def main(args):
    """Train an FCN segmentation model on VOC and checkpoint every epoch."""
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation nun_classes + background
    num_classes = args.num_classes + 1

    train_dataset = VOCSegmentation(args.data_path,
                                    transforms=get_transform(train=True),
                                    txt_name="train.txt")
    val_dataset = VOCSegmentation(args.data_path,
                                  transforms=get_transform(train=False),
                                  txt_name="val.txt")

    worker_count = 8
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=worker_count,
                                               shuffle=True,
                                               pin_memory=True,
                                               collate_fn=train_dataset.collate_fn)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=worker_count,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(aux=args.aux, num_classes=num_classes)
    model.to(device)

    # Optimise only trainable parameters; the aux head (if enabled) gets a
    # 10x learning rate.
    params_to_optimize = [
        {"params": [p for p in model.backbone.parameters() if p.requires_grad]},
        {"params": [p for p in model.classifier.parameters() if p.requires_grad]},
    ]
    if args.aux:
        aux_params = [p for p in model.aux_classifier.parameters() if p.requires_grad]
        params_to_optimize.append({"params": aux_params, "lr": args.lr * 10})

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Polynomial decay of the learning rate, stepped once per epoch.
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lambda x: (1 - x / args.epochs) ** 0.9)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        warmup=True, print_freq=args.print_freq)
        lr_scheduler.step()

        confmat = evaluate(model, val_loader, device=device,
                           num_classes=num_classes)
        print(confmat)

        save_file = {"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch,
                     "args": args}
        torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
# Evaluate a trained AlexNet on CIFAR-10: restore the latest checkpoint and
# report train and test accuracy.
X_train, Y_train, X_test, Y_test = read_cifar_10(image_width=INPUT_WIDTH,
                                                 image_height=INPUT_HEIGHT)

X = tf.placeholder(tf.float32, [None, 32, 32, 3])
Y = tf.placeholder(tf.float32, [None, 10])
dropout_rate = tf.placeholder("float")

fix_model = AlexNet_cifar100(X, qnum=2, dropout_keep_prob=dropout_rate)
param_list = fix_model.parameter_list
tr_model = Train_Alexnet(X, param_list, dropout_keep_prob=dropout_rate)
var_all = tf.trainable_variables(scope=None)

hypothesis = tr_model.hypothesis
# CONSISTENCY FIX: tf.arg_max is a deprecated alias of tf.argmax; the line
# previously mixed both spellings for the two operands.
correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                          name='accuracy')

with tf.device('/GPU:0'):
    with tf.Session() as sess:
        with tf.device('/cpu:0'):
            sess.run(tf.global_variables_initializer())
            loader = tf.train.Saver(var_all)
            loader.restore(sess, tf.train.latest_checkpoint(CHECKPOINT))
            final_train_accuracy = tu.evaluate(sess, accuracy, X, Y,
                                               dropout_rate, X_train, Y_train,
                                               BATCH_SIZE)
            final_test_accuracy = tu.evaluate(sess, accuracy, X, Y,
                                              dropout_rate, X_test, Y_test,
                                              BATCH_SIZE)
            print('Train Accuracy = {:.3f}'.format(final_train_accuracy))
            print('Test Accuracy = {:.3f}'.format(final_test_accuracy))
            print("")
def main():
    """Configure, train, and/or test a retrieval model on the Ubuntu corpus."""
    parser = BasicConfig()
    model_type = vars(parser.parse_known_args()[0])["model_type"].lower()
    model_class, configs = MODEL_CLASSES[model_type]
    args = configs(parser)
    args = checkoutput_and_setcuda(args)
    logger = init_logger(args)
    logger.info('Dataset collected from {}'.format(args.data_dir))

    # Set seed
    set_seed(args)

    processor = UbuntuCorpus(args)
    logger.info(args)
    model = model_class(args=args)
    # model.to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Training
    if args.do_train:
        args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
        train_dataloader = processor.create_batch(data_type="train")
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        eval_dataloader = processor.create_batch(data_type="eval")

        # Log 5 times per epoch; validate once per epoch (in optimizer steps).
        args.logging_steps = len(train_dataloader) // args.gradient_accumulation_steps // 5
        args.valid_steps = len(train_dataloader) // args.gradient_accumulation_steps

        trainer_op = trainer(args=args,
                             model=model,
                             optimizer=optimizer,
                             train_iter=train_dataloader,
                             eval_iter=eval_dataloader,
                             logger=logger,
                             num_epochs=args.num_train_epochs,
                             save_dir=args.output_dir,
                             log_steps=args.logging_steps,
                             valid_steps=args.valid_steps,
                             valid_metric_name="+R10@1")
        trainer_op.train()
        print('training complete!')

    # Test
    if args.do_test:
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        test_dataloader = processor.create_batch(data_type="eval")
        trainer_op = trainer(args=args,
                             model=model,
                             optimizer=optimizer,
                             train_iter=None,
                             eval_iter=None,
                             logger=logger,
                             num_epochs=args.num_train_epochs,
                             save_dir=args.output_dir,
                             log_steps=None,
                             valid_steps=None,
                             valid_metric_name="+R10@1")
        best_model_file = os.path.join(args.output_dir,
                                       args.fusion_type + "_best.model")
        best_train_file = os.path.join(args.output_dir,
                                       args.fusion_type + "_best.train")
        trainer_op.load(best_model_file, best_train_file)
        evaluate(args, trainer_op.model, test_dataloader, logger)
        print('test complete')

    # TODO: Infer case study
    if args.do_infer:
        # Inference branch not implemented yet.
        pass
decay_thresh=0.99) elif args.model == "ridge_regression": monitor = ProgressMonitor(init_lr=args.learning_rate, lr_decay_fac=2.0, min_lr=0.00001, min_metric_better=True, decay_thresh=0.99) else: raise Exception("model not supported!") for epoch in range(args.epoch): # train for one epoch loss_per_step = train(args, model, epoch, train_loader, optimizer, quantizer, kernel_approx) train_loss += loss_per_step # evaluate and save evaluate metric metric, monitor_signal = evaluate(args, model, epoch, val_loader, quantizer, kernel_approx) eval_metric.append(metric) monitor_signal_history.append(monitor_signal) if not os.path.isdir(args.save_path): os.makedirs(args.save_path) np.savetxt(args.save_path + "/train_loss.txt", train_loss) np.savetxt(args.save_path + "/eval_metric.txt", eval_metric) np.savetxt(args.save_path + "/monitor_signal.txt", monitor_signal_history) if not args.fixed_epoch_number: print("using early stopping on lr") early_stop = monitor.end_of_epoch(monitor_signal, model, optimizer, epoch) if early_stop: break
def train(model,
          criterion,
          optimizer,
          optimizer_fp,
          train_iterator,
          n_epochs,
          n_batches,
          val_iterator,
          validation_step,
          n_validation_batches,
          saving_step=None,
          lr_scheduler=None):
    """Run the training loop with periodic validation and optional snapshots.

    Every `validation_step` steps the model is evaluated on `val_iterator`,
    the running training statistics are printed/recorded and then reset.
    Every `saving_step` steps (if set) a deep-copied state_dict snapshot is
    appended to the returned list.

    Returns:
        (all_losses, all_models): per-validation metric tuples and the list
        of snapshotted state_dicts.
    """
    all_losses = []
    all_models = []
    # ReduceLROnPlateau is stepped on validation accuracy below; any other
    # scheduler is treated as a callable applied per step to the optimizer.
    is_reduce_on_plateau = isinstance(lr_scheduler, ReduceLROnPlateau)

    # Running statistics accumulated between validations.
    running_loss = 0.0
    running_accuracy = 0.0
    running_top5_accuracy = 0.0
    start = time.time()
    model.train()

    for epoch in range(0, n_epochs):
        # `step` counts batches globally across epochs (1-based).
        for step, (x_batch, y_batch) in enumerate(train_iterator,
                                                  1 + epoch * n_batches):
            if lr_scheduler is not None and not is_reduce_on_plateau:
                # Non-plateau schedulers return the updated optimizer.
                optimizer = lr_scheduler(optimizer, step)

            batch_loss, batch_accuracy, batch_top5_accuracy = optimization_step(
                model, criterion, optimizer, optimizer_fp, x_batch, y_batch)
            running_loss += batch_loss
            running_accuracy += batch_accuracy
            running_top5_accuracy += batch_top5_accuracy

            if step % validation_step == 0:
                model.eval()
                test_loss, test_accuracy, test_top5_accuracy = evaluate(
                    model, criterion, val_iterator, n_validation_batches)
                end = time.time()
                # Columns: progress, train loss, val loss, train acc, val acc,
                # train top5, val top5, elapsed seconds.
                print(
                    '{0:.2f} {1:.3f} {2:.3f} {3:.3f} {4:.3f} {5:.3f} {6:.3f} {7:.3f}'
                    .format(step / n_batches, running_loss / validation_step,
                            test_loss, running_accuracy / validation_step,
                            test_accuracy,
                            running_top5_accuracy / validation_step,
                            test_top5_accuracy, end - start))
                all_losses += [
                    (step / n_batches, running_loss / validation_step,
                     test_loss, running_accuracy / validation_step,
                     test_accuracy, running_top5_accuracy / validation_step,
                     test_top5_accuracy)
                ]
                if is_reduce_on_plateau:
                    lr_scheduler.step(test_accuracy)
                # Reset the running statistics for the next window.
                running_loss = 0.0
                running_accuracy = 0.0
                running_top5_accuracy = 0.0
                start = time.time()
                model.train()

            if saving_step is not None and step % saving_step == 0:
                print('saving')
                # Snapshot on CPU so the stored state_dict does not pin GPU memory.
                model.cpu()
                clone = copy.deepcopy(model)
                all_models += [clone.state_dict()]
                model.cuda()
    return all_losses, all_models
def main(args):
    """Train an FCN segmentation model on VOC2012, logging per-epoch metrics
    to a timestamped results file and checkpointing every epoch."""
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation nun_classes + background
    num_classes = args.num_classes + 1

    # File used to record information from training and validation.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> train.txt
    train_dataset = VOCSegmentation(args.data_path,
                                    year="2012",
                                    transforms=get_transform(train=True),
                                    txt_name="train.txt")

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> val.txt
    val_dataset = VOCSegmentation(args.data_path,
                                  year="2012",
                                  transforms=get_transform(train=False),
                                  txt_name="val.txt")

    num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(aux=args.aux, num_classes=num_classes)
    model.to(device)

    # Only trainable parameters are optimised; the auxiliary head (when
    # enabled below) gets a 10x learning rate.
    params_to_optimize = [{
        "params": [p for p in model.backbone.parameters() if p.requires_grad]
    }, {
        "params": [p for p in model.classifier.parameters() if p.requires_grad]
    }]

    if args.aux:
        params = [
            p for p in model.aux_classifier.parameters() if p.requires_grad
        ]
        params_to_optimize.append({"params": params, "lr": args.lr * 10})

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Mixed-precision gradient scaler (enabled only with --amp).
    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # Create the LR schedule; it is stepped once per step, not per epoch.
    lr_scheduler = create_lr_scheduler(optimizer,
                                       len(train_loader),
                                       args.epochs,
                                       warmup=True)

    # import matplotlib.pyplot as plt
    # lr_list = []
    # for _ in range(args.epochs):
    #     for _ in range(len(train_loader)):
    #         lr_scheduler.step()
    #         lr = optimizer.param_groups[0]["lr"]
    #         lr_list.append(lr)
    # plt.plot(range(len(lr_list)), lr_list)
    # plt.show()

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        mean_loss, lr = train_one_epoch(model, optimizer, train_loader, device,
                                        epoch,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)

        confmat = evaluate(model, val_loader, device=device,
                           num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)

        # write into txt
        with open(results_file, "a") as f:
            # Record each epoch's train_loss, lr and validation metrics.
            train_info = f"[epoch: {epoch}]\n" \
                         f"train_loss: {mean_loss:.4f}\n" \
                         f"lr: {lr:.6f}\n"
            f.write(train_info + val_info + "\n\n")

        save_file = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
            "epoch": epoch,
            "args": args
        }
        if args.amp:
            save_file["scaler"] = scaler.state_dict()
        torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
def run(learning_rate, batch_size, cuda, memory_feature_size, num_inputs,
        num_outputs, controller_size, controller_type, controller_layers,
        memory_size, integer_shift, checkpoint_interval, total_batches,
        model_file):
    """Train an NTM on the copy task, resuming from `model_file` if given.

    Checkpoints (and evaluates) every `checkpoint_interval` training
    examples and stops once enough checkpoint intervals have elapsed.
    """
    # model_file = "checkpoints/ntm/copy-batch-5120.0--LSTM.model"
    # Seeding
    SEED = 1000
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Model Loading
    if model_file == 'None':
        # Fresh model built from the supplied hyper-parameters.
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda)
        # Constants for keeping track
        total_examples = 0
        losses = []
        costs = []
        seq_lens = []
    else:
        # Resume: hyper-parameters restored from the checkpoint override the
        # function arguments of the same name.
        from_before = torch.load(model_file)
        state_dict = from_before['state_dict']
        controller_type = from_before['controller_type']
        num_inputs = from_before['num_inputs']
        num_outputs = from_before['num_outputs']
        controller_size = from_before['controller_size']
        controller_layers = from_before['controller_layers']
        memory_size = from_before['memory_size']
        memory_feature_size = from_before['memory_feature_size']
        integer_shift = from_before['integer_shift']
        batch_size = from_before['batch_size']
        cuda = from_before['cuda']
        saved_biases = True
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda,
                  saved_biases=saved_biases)
        ntm.load_state_dict(state_dict)
        losses = from_before['loss']
        costs = from_before['cost']
        seq_lens = from_before['seq_lengths']
        total_examples = from_before['total_examples']

    # Dataset creation
    training_dataset = random_binary(max_seq_length=20,
                                     num_sequences=500,
                                     vector_dim=8,
                                     batch_Size=batch_size)
    testing_dataset = random_binary(max_seq_length=10,
                                    num_sequences=50,
                                    vector_dim=8,
                                    batch_Size=batch_size)

    # Optimizer type and loss function
    # optimizer = torch.optim.Adam(ntm.parameters(), lr=learning_rate)
    optimizer = torch.optim.RMSprop(ntm.parameters(),
                                    lr=learning_rate,
                                    momentum=0.9,
                                    alpha=0.95)
    criterion = torch.nn.BCELoss()

    np.random.seed(
        SEED
    )  # reset training seed to ensure that batches remain the same between runs!

    for batch in training_dataset:
        optimizer.zero_grad()
        # Initialize head weights and memory to zero
        ntm.init_headweights()
        ntm.init_memory()

        # NOTE(review): Variable/.data[0] usage indicates a pre-0.4 PyTorch
        # codebase; left unchanged for compatibility with the rest of it.
        batch = Variable(batch)
        if cuda:
            batch = batch.cuda()
        next_r = ntm.read_head.create_state(batch_size)
        if controller_type == 'LSTM':
            lstm_h, lstm_c = ntm.controller.create_state(batch_size)

        # Read batch in: feed each column of the sequence to the NTM.
        for i in range(batch.size()[2]):
            x = batch[:, :, i]
            if controller_type == 'LSTM':
                _, next_r, lstm_h, lstm_c = ntm.forward(x=x,
                                                        r=next_r,
                                                        lstm_h=lstm_h,
                                                        lstm_c=lstm_c)
            elif controller_type == 'MLP':
                _, next_r = ntm.forward(x=x, r=next_r)

        # Output response: feed zeros and collect the model's reproduction
        # (the last column, presumably a delimiter, is excluded via [:, :, :-1]).
        x = Variable(torch.zeros(batch.size()[0:2]))
        output = Variable(torch.zeros(batch[:, :, :-1].size()))
        if cuda:
            x = x.cuda()
            output = output.cuda()
        for i in range(output.size()[2]):
            if controller_type == 'LSTM':
                output[:, :, i], next_r, lstm_h, lstm_c = ntm.forward(
                    x=x, r=next_r, lstm_h=lstm_h, lstm_c=lstm_c)
            elif controller_type == 'MLP':
                output[:, :, i], next_r = ntm.forward(x=x, r=next_r)

        loss = criterion(output, batch[:, :, :-1])
        loss.backward(retain_graph=True)
        optimizer.step()
        print("Current Batch Loss:", round(loss.data[0], 3))
        total_examples += batch_size

        # The cost is the number of error bits per sequence
        binary_output = output.clone().data
        binary_output = binary_output > 0.5
        cost = torch.sum(
            torch.abs(binary_output.float() - batch.data[:, :, :-1]))

        losses += [loss.data[0]]
        costs += [cost / batch_size]
        seq_lens += [batch.size(2)]

        # Checkpoint model
        if (checkpoint_interval != 0) and (total_examples % checkpoint_interval == 0):
            print("Saving Checkpoint!")
            save_checkpoint(ntm, total_examples / batch_size, losses, costs,
                            seq_lens, total_examples, controller_type,
                            num_inputs, num_outputs, controller_size,
                            controller_layers, memory_size,
                            memory_feature_size, integer_shift, batch_size,
                            cuda)

            # Evaluate model on this saved checkpoint
            test_cost, prediction, input = evaluate(
                model=ntm,
                testset=testing_dataset,
                batch_size=batch_size,
                memory_feature_size=memory_feature_size,
                controller_type=controller_type,
                cuda=cuda)
            print("Total Test Cost (in bits per sequence):", test_cost)
            print("Example of Input/Output")
            print("prediction:", prediction[0])
            print("Input:", input[0])
        if total_examples / checkpoint_interval >= total_batches:
            break
def trainEvalLM(args):
    """Train a language model with SGD (switching to ASGD on plateau) and
    evaluate on the validation set each epoch, logging results to file."""
    # Cache the tokenised corpus keyed by a hash of the data path.
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    if torch.cuda.is_available():
        args.cuda = True

    ntokens = len(corpus.dictionary)
    eval_batch_size = 10
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)

    # Build the model and loss function
    model = lmModel.RNNModel(args.model,
                             ntokens,
                             args.emsize,
                             args.nhid,
                             args.nlayers,
                             args.dropout,
                             args.tied,
                             g=args.g,
                             k=args.k)
    criterion = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    #compute network parameters
    params = list(model.parameters())
    total_params = np.sum([np.prod(p.size()) for p in params])
    print(
        '\033[1;32;40mTotal parameters (in million):\033[0m\033[1;31;40m {:0.2f} \033[0m\n'
        .format(total_params / 1e6, 2))

    optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)

    start_epoch = 1
    if args.resume:
        print('Resuming model ...')
        model, criterion, optimizer, start_epoch = model_load(args.resume)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropout = args.dropout

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        #Create folder for saving model and log files
        args.saveDir += '_' + args.model  # =====================
        if not os.path.isdir(args.saveDir):
            os.mkdir(args.saveDir)
        save_str = 'nl_' + str(args.nlayers) + '_nh_' + str(
            args.nhid) + '_g_' + str(args.g) + '_k_' + str(args.k)
        args.save = args.saveDir + '/model_' + save_str + '.pt'

        logFileLoc = args.saveDir + '/logs_' + save_str + '.txt'
        logger = open(logFileLoc, 'w')
        logger.write(str(args))
        logger.write('\n Total parameters (in million): {:0.2f}'.format(
            total_params / 1e6, 2))
        logger.write('\n\n')
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'ppl (tr)', 'ppl (val)'))
        logger.flush()

        best_val_loss = []
        stored_loss = 100000000

        # Loop over epochs.
        for epoch in range(start_epoch, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train(args, model, criterion, optimizer, epoch,
                               train_data, ntokens)

            ### TRAIN WITH ASGD
            if 't0' in optimizer.param_groups[0]:
                # ASGD active: temporarily swap in the averaged weights
                # ('ax') for evaluation, then restore the raw weights after.
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss = evaluate(args, model, criterion, val_data, ntokens,
                                    eval_batch_size)
                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss,
                        math.exp(val_loss)))
                print('-' * 89)
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss,
                              math.exp(train_loss), math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving Averaged (new best validation)')
                    stored_loss = val_loss

                # Restore the raw (non-averaged) weights for further training.
                for prm in model.parameters():
                    prm.data = tmp[prm].clone()
            else:
                val_loss = evaluate(args, model, criterion, val_data, ntokens,
                                    eval_batch_size)
                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss,
                        math.exp(val_loss)))
                print('-' * 89)
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss,
                              math.exp(train_loss), math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                # Switch to ASGD once validation stops improving for
                # `nonmono` consecutive epochs.
                if 't0' not in optimizer.param_groups[0] and (
                        len(best_val_loss) > args.nonmono and
                        val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(),
                                                 lr=args.lr,
                                                 t0=0,
                                                 lambd=0.,
                                                 weight_decay=args.wdecay)

                best_val_loss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
steps = tqdm(range(1, args.total_steps + 1)) for step in steps: steps.set_description( f'Best validation accuracy: {best_validation_accuracy:.3f}') try: supervised_batch = next(supervised_train_dataiter) except StopIteration: supervised_train_dataiter = iter(supervised_train_dataloader) supervised_batch = next(supervised_train_dataiter) try: unsupervised_batch = next(unsupervised_dataiter) except StopIteration: unsupervised_dataiter = iter(unsupervised_dataloader) unsupervised_batch = next(unsupervised_dataiter) optimizer.zero_grad() total_loss, supervised_loss, unsupervised_loss = train_utils.compute_loss( device, model, supervised_batch, unsupervised_batch, supervised_criterion, unsupervised_criterion, step, args) total_loss.backward() optimizer.step() if not step % args.evaluate_every: accuracy = train_utils.evaluate(device, model, supervised_validation_dataloader) if accuracy > best_validation_accuracy: best_validation_accuracy = accuracy torch.save(model.state_dict(), os.path.join(save_path, 'model.pt'))
def main(args):
    """Single-GPU training entry point for the DRIVE vessel-segmentation model.

    Builds train/val datasets and loaders, the model (via ``create_model``),
    an SGD optimizer with a per-step warmup LR schedule and optional AMP,
    optionally resumes from a checkpoint, then runs the train/evaluate loop,
    appending per-epoch metrics to a timestamped results file and saving
    checkpoints under ``save_weights/``.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation num_classes + background
    num_classes = args.num_classes + 1
    # dataset statistics computed offline using compute_mean_std.py
    mean = (0.709, 0.381, 0.224)
    std = (0.127, 0.079, 0.043)
    # timestamped file used to record training/validation metrics
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    train_dataset = DriveDataset(args.data_path,
                                 train=True,
                                 transforms=get_transform(train=True,
                                                          mean=mean, std=std))
    val_dataset = DriveDataset(args.data_path,
                               train=False,
                               transforms=get_transform(train=False,
                                                        mean=mean, std=std))
    # cap worker count by CPU count, batch size, and a hard limit of 8
    num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               shuffle=True,
                                               pin_memory=True,
                                               collate_fn=train_dataset.collate_fn)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)
    model = create_model(num_classes=num_classes)
    model.to(device)
    # optimize only the trainable parameters
    params_to_optimize = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params_to_optimize,
        lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay
    )
    scaler = torch.cuda.amp.GradScaler() if args.amp else None
    # LR schedule steps every *iteration* (not every epoch), with warmup
    lr_scheduler = create_lr_scheduler(optimizer, len(train_loader),
                                       args.epochs, warmup=True)
    if args.resume:
        # restore model/optimizer/scheduler (and AMP scaler) state
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])
    best_dice = 0.
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        mean_loss, lr = train_one_epoch(model, optimizer, train_loader, device,
                                        epoch, num_classes,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)
        confmat, dice = evaluate(model, val_loader, device=device,
                                 num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        print(f"dice coefficient: {dice:.3f}")
        # write into txt
        with open(results_file, "a") as f:
            # record this epoch's train loss, lr and validation metrics
            train_info = f"[epoch: {epoch}]\n" \
                         f"train_loss: {mean_loss:.4f}\n" \
                         f"lr: {lr:.6f}\n" \
                         f"dice coefficient: {dice:.3f}\n"
            f.write(train_info + val_info + "\n\n")
        if args.save_best is True:
            if best_dice < dice:
                best_dice = dice
            else:
                continue  # not a new best: skip checkpointing this epoch
        save_file = {"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch,
                     "args": args}
        if args.amp:
            save_file["scaler"] = scaler.state_dict()
        if args.save_best is True:
            torch.save(save_file, "save_weights/best_model.pth")
        else:
            torch.save(save_file, "save_weights/model_{}.pth".format(epoch))
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
y_source_domain) loss_target_domain = domain_loss_criterion(target_domain_pred, y_target_domain) domain_loss__ = loss_source_domain + loss_target_domain source_domain_loss.append(loss_source_domain) target_domain_loss.append(loss_target_domain) batch.append(batch__) loss = loss_source_label + domain_loss__ loss.backward() optimizer.step() print(f'[{batch__ + 1}/{max_batches}] ' f'class_loss: {loss_source_label.item():.4f} ' f'source_domain_loss: {loss_source_domain.item():.4f} ' f't_domain_loss: {loss_target_domain.item():.4f} ' f'lambda: {completion_lambda:.3f} ') writer.add_scalar('Class Loss', loss_source_label, batch__) writer.add_scalars( f'Domain_Loss', { 'Source Loss': source_domain_loss[batch__], 'Target Loss': target_domain_loss[batch__] }, batch__) acc_source = evaluate(source_eval_dataloader) writer.add_scalars(f'Source Domain Accuracy', {'Source': acc_source}, epoch) i += 1 writer.flush()
def run_training(model, cfg, test_features, test_labels, train_data,
                 train_labels, val_data, val_labels):
    """Train ``model`` with early stopping on validation loss, then evaluate
    the best checkpoint on the test set.

    Creates a timestamped run directory under ``MODEL_PATH``, dumps the config
    there, trains for up to ``cfg.n_epochs`` epochs (stopping after
    ``cfg.patience`` epochs without validation improvement), checkpoints the
    weights whenever validation loss improves, and finally reloads the best
    checkpoint and writes the test result to ``result.txt``.

    Note: each epoch evaluates *before* training, so the logged metrics for
    epoch ``e`` reflect the model after ``e`` completed training epochs.
    """
    model_run_path = MODEL_PATH + "/" + strftime("%Y-%m-%d_%H:%M:%S", gmtime())
    model_weights_path = "{}/{}".format(model_run_path, cfg.model_weights_name)
    model_config_path = "{}/{}".format(model_run_path, cfg.model_config_name)
    result_path = "{}/result.txt".format(model_run_path)
    os.makedirs(model_run_path, exist_ok=True)

    # Choose hardware
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == "cuda":
        print(
            "Using GPU. Setting default tensor type to torch.cuda.FloatTensor")
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    else:
        print("Using CPU. Setting default tensor type to torch.FloatTensor")
        torch.set_default_tensor_type("torch.FloatTensor")

    # FIX: use a context manager so the config file handle is closed
    # deterministically (was: json.dump(..., open(path, "w"))).
    with open(model_config_path, "w") as config_file:
        json.dump(cfg.to_json(), config_file)

    # Convert model to the chosen hardware and to float32
    model.float()
    model = model.to(device)

    # Define loss and optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    criterion = torch.nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Create data generators
    test_iterator = BatchIterator(test_features, test_labels, 100)
    train_iterator = BatchIterator(train_data, train_labels, cfg.batch_size)
    validation_iterator = BatchIterator(val_data, val_labels, 100)

    # train_loss/train_acc are placeholders until the first training epoch
    # completes (they appear in the first improvement log line).
    train_loss = 999
    # FIX: was 999 — if the first validation loss exceeded 999, no checkpoint
    # was ever written and torch.load(model_weights_path) below would crash.
    best_val_loss = float('inf')
    train_acc = 0
    epochs_without_improvement = 0

    # Run training with early stopping
    for epoch in range(cfg.n_epochs):
        train_iterator.shuffle()
        if epochs_without_improvement == cfg.patience:
            break

        val_loss, val_acc, val_weighted_acc, conf_mat = evaluate(
            model, validation_iterator, criterion)

        if val_loss < best_val_loss:
            # New best: checkpoint and reset the patience counter.
            torch.save(model.state_dict(), model_weights_path)
            best_val_loss = val_loss
            best_val_acc = val_acc
            best_val_weighted_acc = val_weighted_acc
            best_conf_mat = conf_mat
            epochs_without_improvement = 0
            log_success(
                " Epoch: {} | Val loss improved to {:.4f} | val acc: {:.3f} | "
                "weighted val acc: {:.3f} | train loss: {:.4f} | "
                "train acc: {:.3f} | saved model to {}.".format(
                    epoch, best_val_loss, best_val_acc,
                    best_val_weighted_acc, train_loss, train_acc,
                    model_weights_path))

        train_loss, train_acc, train_weighted_acc, _ = train(
            model, train_iterator, optimizer, criterion, cfg.reg_ratio)

        epochs_without_improvement += 1

        if not epoch % 1:
            log(
                f'| Epoch: {epoch+1} | Val Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}% '
                f'| Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.3f}%',
                cfg.verbose)

    # Reload the best checkpoint and evaluate on the held-out test set.
    model.load_state_dict(torch.load(model_weights_path))
    test_loss, test_acc, test_weighted_acc, conf_mat = evaluate(
        model, test_iterator, criterion)

    result = f'| Epoch: {epoch+1} | Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Weighted Test Acc: {test_weighted_acc*100:.2f}%\n Confusion matrix:\n {conf_mat}'
    log_major("Train acc: {}".format(train_acc))
    log_major(result)
    log_major("Hyperparameters:{}".format(cfg.to_json()))
    with open(result_path, "w") as file:
        file.write(result)
shuffle=True, collate_fn=pad_collate) model = BilstmAspectAttPool(Configs1()) initialize_weights(model) print(model) model = model.to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=lr) best_valid_loss = float('inf') for epoch in range(EPOCHS): start_time = time.time() train_loss = train(model, train_loader, optimizer, criterion, CLIP, device) valid_loss = evaluate(model, test_loader, criterion, device) end_time = time.time() epoch_mins, epoch_secs = epoch_time(start_time, end_time) if valid_loss < best_valid_loss: best_valid_loss = valid_loss torch.save(model.state_dict(), model_name) print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s') print( f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}' ) print( f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}'
".pth", '_{}.pth'.format(args.load_epoch)) model.load_state_dict(torch.load(model_dir)) model.eval() results = pd.read_csv(pretrained_model_dir.replace(".pth", ".csv")) results = { col_name: list(results[col_name].values) for col_name in results.columns } stp = 1 + len(results['epochs']) if gan_cfg.evaluate: images_dir = os.path.join(saving_dir, 'images') if not os.path.exists(images_dir): os.makedirs(images_dir) pcc, ssim, mse, is_mean = evaluate(model, dataloader_valid, norm=True, mean=gan_cfg.mean, std=gan_cfg.std, path=images_dir) print("Mean PCC:", pcc) print("Mean SSIM:", ssim) print("Mean MSE:", mse) print("IS mean", is_mean) exit(0) else: logging.info('Initialize') stp = 1 results = dict(epochs=[], loss_encoder=[], loss_decoder=[]) # An optimizer for each of the sub-networks, so we can selectively backpropogate optimizer_encoder = torch.optim.RMSprop(params=model.encoder.parameters(),
# Script tail: parse CLI arguments, train VGG_lite on CIFAR, save the
# weights, plot history and confusion matrices, and report test accuracy.
args = parser.parse_args()
cifar_dir = args.cifar_root
fig_path = args.fig_path
validation_split = args.val_split
batch_size = args.batch_size
epochs = args.epochs
weight_path = args.weight_path
weight_decay = args.weight_decay
lr = args.lr
# NOTE(review): SEED is assigned but not used in this fragment — seeding
# presumably happens elsewhere (e.g. inside get_data/train); confirm.
SEED = args.seed  # set random seed (default as 1234)
# split train, val, test from `get_data` function
train_loader, val_loader, test_loader = get_data(cifar_dir=cifar_dir,
                                                 batch_size=batch_size,
                                                 augment=True,
                                                 validation_split=validation_split)
# load model
model = VGG_lite()
# define loss
loss = nn.CrossEntropyLoss()
# train the model
model, history = train(model, train_loader, val_loader, epochs, loss,
                       batch_size, optimizer='adam',
                       weight_decay=weight_decay, lr=lr)
# save the model according to `weight_path` from parser (default './weights/final.pth')
torch.save(model.state_dict(), weight_path)
plot_history(history, fig_path)  # save figures
acc, cm, cm_norm = evaluate(model, test_loader)  # evaluate trained model
plot_cm(cm, cm_norm, fig_path)  # save confusion matrix figures
print('Test Accuracy: {}%'.format(round(acc*100, 4)))  # print the model test accuracy
def main(args):
    """Distributed (DDP-aware) training entry point for the DRIVE
    vessel-segmentation model.

    Initializes distributed mode, builds train/val datasets and loaders
    (distributed samplers when applicable), constructs the model (optionally
    SyncBN + DistributedDataParallel), optionally resumes from a checkpoint,
    then runs the train/evaluate loop — logging metrics to a timestamped
    results file on the main process only and checkpointing via
    ``save_on_master``.
    """
    init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)
    # segmentation num_classes + background
    num_classes = args.num_classes + 1
    # dataset statistics (see the single-GPU variant: computed offline)
    mean = (0.709, 0.381, 0.224)
    std = (0.127, 0.079, 0.043)
    # timestamped file used to record per-epoch metrics
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    data_root = args.data_path
    # check data root
    if os.path.exists(os.path.join(data_root, "DRIVE")) is False:
        raise FileNotFoundError(
            "DRIVE dose not in path:'{}'.".format(data_root))
    train_dataset = DriveDataset(args.data_path,
                                 train=True,
                                 transforms=get_transform(train=True,
                                                          mean=mean, std=std))
    val_dataset = DriveDataset(args.data_path,
                               train=False,
                               transforms=get_transform(train=False,
                                                        mean=mean, std=std))
    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        test_sampler = torch.utils.data.SequentialSampler(val_dataset)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler,
        num_workers=args.workers, collate_fn=train_dataset.collate_fn,
        drop_last=True)
    # FIX: use the *validation* dataset's collate_fn (was
    # train_dataset.collate_fn), matching the single-GPU variant.
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=1, sampler=test_sampler,
        num_workers=args.workers, collate_fn=val_dataset.collate_fn)
    print("Creating model")
    # create model num_classes equal background + foreground classes
    model = create_model(num_classes=num_classes)
    model.to(device)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    # keep a handle to the unwrapped model for checkpointing/optimization
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    params_to_optimize = [
        p for p in model_without_ddp.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scaler = torch.cuda.amp.GradScaler() if args.amp else None
    # LR schedule steps every *iteration* (not every epoch), with warmup
    lr_scheduler = create_lr_scheduler(optimizer, len(train_data_loader),
                                       args.epochs, warmup=True)
    # Resume from a previous run's checkpoint (model/optimizer/scheduler).
    if args.resume:
        # If map_location is missing, torch.load will first load the module to
        # CPU and then copy each parameter to where it was saved, which would
        # result in all processes on the same machine using the same set of
        # devices.
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])
    if args.test_only:
        # evaluation-only mode: print the confusion matrix and exit
        confmat = evaluate(model, val_data_loader, device=device,
                           num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        return
    best_dice = 0.
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # reshuffle shards each epoch so every rank sees new data order
            train_sampler.set_epoch(epoch)
        mean_loss, lr = train_one_epoch(model, optimizer, train_data_loader,
                                        device, epoch, num_classes,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)
        confmat, dice = evaluate(model, val_data_loader, device=device,
                                 num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        print(f"dice coefficient: {dice:.3f}")
        # only the main process writes the results file
        if args.rank in [-1, 0]:
            with open(results_file, "a") as f:
                # record this epoch's train loss, lr and validation metrics
                train_info = f"[epoch: {epoch}]\n" \
                             f"train_loss: {mean_loss:.4f}\n" \
                             f"lr: {lr:.6f}\n" \
                             f"dice coefficient: {dice:.3f}\n"
                f.write(train_info + val_info + "\n\n")
        if args.save_best is True:
            if best_dice < dice:
                best_dice = dice
            else:
                continue  # not a new best: skip checkpointing this epoch
        if args.output_dir:
            # checkpoint is only written on the master process (save_on_master)
            save_file = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch
            }
            if args.amp:
                save_file["scaler"] = scaler.state_dict()
            if args.save_best is True:
                save_on_master(save_file,
                               os.path.join(args.output_dir,
                                            'best_model.pth'))
            else:
                save_on_master(
                    save_file,
                    os.path.join(args.output_dir,
                                 'model_{}.pth'.format(epoch)))
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
linguistic_model = AttentionModel(linguistic_cfg) linguistic_model.float().to(device) try: linguistic_model.load_state_dict(torch.load(args.linguistic_model)) except: print( "Failed to load model from {} without device mapping. Trying to load with mapping to {}" .format(args.linguistic_model, device)) linguistic_model.load_state_dict( torch.load(args.linguistic_model, map_location=device)) """Defining loss and optimizer""" criterion = torch.nn.CrossEntropyLoss().to(device) test_loss, test_acc, test_weighted_acc, conf_mat = evaluate( acoustic_model, test_iterator_acoustic, criterion) print("Acoustic: loss: {}, acc: {}. unweighted acc: {}, conf_mat: \n{}". format(test_loss, test_acc, test_weighted_acc, conf_mat)) test_loss, test_acc, test_weighted_acc, conf_mat = evaluate( linguistic_model, test_iterator_linguistic, criterion) print( "Linguistic(asr=False): loss: {}, acc: {}. unweighted acc: {}, conf_mat: \n{}" .format(test_loss, test_acc, test_weighted_acc, conf_mat)) test_loss, test_acc, test_weighted_acc, conf_mat = evaluate_ensemble( acoustic_model, linguistic_model, test_iterator_acoustic, test_iterator_linguistic, torch.nn.NLLLoss().to(device), "average") print( "Ensemble average: loss: {}, acc: {}. unweighted acc: {}, conf_mat: \n{}"
def main():
    """CLI entry point: train (or test-only evaluate) a CNN on Camelyon
    patches with Chainer, supporting multi-GPU updaters, periodic evaluation,
    checkpointing and step-wise learning-rate decay."""
    parser = argparse.ArgumentParser(description='gpat train ')
    parser.add_argument("out")
    parser.add_argument('--resume', default=None)
    parser.add_argument('--log_dir', default='runs_16')
    parser.add_argument('--gpus', '-g', type=int, nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--iterations', default=10**5, type=int,
                        help='number of iterations to learn')
    parser.add_argument('--interval', default=1000, type=int,
                        help='number of iterations to evaluate')
    parser.add_argument('--batch_size', '-b', type=int, default=128,
                        help='learning minibatch size')
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--loaderjob', type=int, default=8)
    # parser.add_argument('--size', '-s', default=96, type=int, choices=[48, 64, 80, 96, 112, 128],
    #                     help='image size')
    parser.add_argument('--hed', dest='hed', action='store_true',
                        default=False)
    parser.add_argument('--from_tiff', dest='from_tiff', action='store_true',
                        default=False)
    parser.add_argument('--no-texture', dest='texture', action='store_false',
                        default=True)
    parser.add_argument('--cbp', dest='cbp', action='store_true',
                        default=False)
    parser.add_argument('--no-normalize', dest='normalize',
                        action='store_false', default=True)
    parser.add_argument('--no-color_aug', dest='color_aug',
                        action='store_false', default=True)
    parser.add_argument('--model_test', default='', type=str)
    parser.add_argument('--no-finetune', dest='finetune',
                        action='store_false', default=True)
    parser.add_argument('--arch', default='googlenet', choices=[
        'texturecnn', 'resnet50', 'googlenet', 'vgg', 'alex', 'trained',
        'resume'
    ])
    parser.add_argument('--opt', default='adam', choices=['adam', 'momentum'])
    parser.add_argument('--train_path',
                        default='train_0330_additional_new.npy')
    parser.add_argument('--data_size', type=float, default=1)
    parser.add_argument('--test_path', default='diag_256_0406.pkl')
    parser.add_argument('--new', action='store_true', default=False)
    args = parser.parse_args()
    devices = tuple(args.gpus)
    #
    os.environ['PATH'] += ':/usr/local/cuda/bin'
    # log directory
    logger.init(args)
    # load data
    train_data = np.load(os.path.join(dataset_path, args.train_path))
    test_data = np.load(os.path.join(dataset_path, args.test_path))
    num_class = 3 if 'three_class' in args.train_path else 2
    # patch geometry follows the naming convention of the training file
    if '512' in args.train_path:
        image_size = 512
        crop_size = 384
    else:
        image_size = 256
        crop_size = 224 if not args.arch == 'alex' else 227
    # optionally subsample the training set (data_size is a fraction of it)
    perm = np.random.permutation(len(train_data))
    train_data = train_data[perm[:int(len(train_data) * args.data_size)]]
    preprocess_type = args.arch if not args.hed else 'hed'
    train = CamelyonDataset(train_data,
                            original_size=image_size,
                            crop_size=crop_size,
                            aug=True,
                            color_aug=args.color_aug,
                            num_class=num_class,
                            from_tif=False,
                            preprocess_type=preprocess_type)
    # one iterator per GPU when training data-parallel
    if len(devices) > 1:
        train_iter = [
            chainer.iterators.MultiprocessIterator(i, args.batch_size,
                                                   n_processes=args.loaderjob)
            for i in chainer.datasets.split_dataset_n_random(
                train, len(devices))
        ]
    else:
        train_iter = iterators.MultiprocessIterator(
            train, args.batch_size, n_processes=args.loaderjob)
    # one evaluation iterator per diagnosis subset in the test pickle
    diag_iter = {}
    for diag in test_data:
        image_size = int(diag.split('_')[-1])
        test = CamelyonDataset(test_data[diag],
                               original_size=image_size,
                               crop_size=crop_size,
                               aug=False,
                               color_aug=False,
                               num_class=num_class,
                               from_tif=False,
                               preprocess_type=preprocess_type,
                               texture=args.texture)
        diag_iter[diag] = iterators.MultiprocessIterator(test,
                                                         args.batch_size,
                                                         repeat=False,
                                                         shuffle=False)
    # model construct
    if args.new:
        if args.texture:
            model = BilinearCNN(base_cnn=args.arch,
                                pretrained_model='auto',
                                num_class=num_class,
                                texture_layer=None,
                                cbp=args.cbp,
                                cbp_size=4096)
        else:
            model = TrainableCNN(base_cnn=args.arch,
                                 pretrained_model='auto',
                                 num_class=num_class)
    else:
        model = archs[args.arch](texture=args.texture,
                                 cbp=args.cbp,
                                 normalize=args.normalize)
        if args.finetune:
            model.load_pretrained(
                os.path.join(MODEL_PATH, init_path[args.arch]), num_class)
        else:
            model.convert_to_finetune_model(num_class)
    # resume from the newest checkpoint of a previous run, if requested
    if args.resume is not None:
        model_path = os.path.join(
            'runs_16', args.resume, 'models',
            sorted(os.listdir(os.path.join('runs_16', args.resume,
                                           'models')))[-1])
        print(model_path)
        chainer.serializers.load_npz(model_path, model)
    # set optimizer
    optimizer = make_optimizer(model, args.opt, args.lr)
    if args.model_test:
        # test-only mode: load the newest checkpoint, evaluate and exit
        model_path = os.path.join(
            'runs_16', args.model_test, 'models',
            sorted(
                os.listdir(os.path.join('runs_16', args.model_test,
                                        'models')))[-1])
        print(model_path)
        chainer.serializers.load_npz(model_path, model)
        cuda.get_device_from_id(devices[0]).use()
        model.to_gpu()
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            evaluate(model, diag_iter, devices[0])
        logger.flush()
        exit()
    if len(devices) > 1:
        updater = updaters.MultiprocessParallelUpdater(train_iter,
                                                       optimizer,
                                                       devices=devices)
    else:
        cuda.get_device_from_id(devices[0]).use()
        model.to_gpu()
        # updater
        updater = chainer.training.StandardUpdater(train_iter,
                                                   optimizer,
                                                   device=devices[0])
    # start training
    start = time.time()
    train_loss = 0
    train_accuracy = 0
    while updater.iteration < args.iterations:
        # train
        updater.update()
        progress_report(updater.iteration, start,
                        len(devices) * args.batch_size, len(train))
        train_loss += model.loss.data
        train_accuracy += model.accuracy.data
        # every `interval` iterations: log running averages, evaluate, save
        if updater.iteration % args.interval == 0:
            logger.plot('train_loss', cuda.to_cpu(train_loss) / args.interval)
            logger.plot('train_accuracy',
                        cuda.to_cpu(train_accuracy) / args.interval)
            train_loss = 0
            train_accuracy = 0
            # test
            with chainer.using_config('train',
                                      False), chainer.no_backprop_mode():
                evaluate(model, diag_iter, devices[0])
            # logger
            logger.flush()
            # save
            serializers.save_npz(os.path.join(logger.out_dir, 'resume'),
                                 updater)
            # NOTE(review): this LR-decay check appears to be nested inside the
            # logging interval, so it only fires when `interval` divides the
            # iteration count at a multiple of 20000 — confirm intended nesting.
            if updater.iteration % 20000 == 0:
                if args.opt == 'adam':
                    optimizer.alpha *= 0.1
                else:
                    optimizer.lr *= 0.1
# load saved model print('\nLoading model from [%s]...' % args.snapshot) try: model = torch.load(args.snapshot) except Exception as e: print(e) exit(1) print("Load complete") # Train the model on train_data, use dev_data for early stopping model, dev_res = train_utils.train(train_data, dev_data, model, args) # Evaluate the trained model print("Evaluate on train set") train_res = train_utils.evaluate(train_data, model, args) print("Evaluate on test set") test_res = train_utils.evaluate(test_data, model, args, roc=True) if args.result_path: directory = args.result_path[:args.result_path.rfind('/')] if not os.path.exists(directory): os.makedirs(directory) result = { 'train_loss': train_res[0], 'train_acc': train_res[1], 'train_recall': train_res[2], 'train_precision': train_res[3], 'train_f1': train_res[4],
# train the network and early stop by dev loss dev_res = train_utils.train(train_data, dev_data, model, args) if args.save: with open(dev_res[-1] + '.vocab', 'wb') as f: pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL) prefix = 'results_rat/' + args.dataset + '/' if not os.path.exists(prefix): os.makedirs(prefix) print("Evaluate on train set") writer = data_utils.generate_writer( prefix + 'sel_'+str(args.l_selection) + '_target_' + str(args.l_selection_target) +\ '_var_' + str(args.l_variation) + '.train') train_res = train_utils.evaluate(train_data, model, args, writer) data_utils.close_writer(writer) print("Evaluate on dev set") writer = data_utils.generate_writer( prefix + 'sel_'+str(args.l_selection) + '_target_' + str(args.l_selection_target) +\ '_var_' + str(args.l_variation) + '.dev') dev_res = train_utils.evaluate(dev_data, model, args, writer) data_utils.close_writer(writer) if args.result_path: result = { 'train_loss': train_res[0], 'train_acc': train_res[1], 'train_recall': train_res[2], 'train_precision': train_res[3],