def run():
    global args
    args = parser.parse_args()
    print(args)

    # TODO: add checkpoint resume option

    # load datasets
    train_dataset = SketchData(root=path, train=True,
                               transform=None, target_transform=None)
    val_dataset = SketchData(root=path, train=False,
                             transform=None, target_transform=None)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.b,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.b,
                                             shuffle=False)

    # create model, optimizer and loss
    model = AlexNet()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss()

    best_prec = 0
    for epoch in range(args.epochs):
        print('Epoch: ' + str(epoch))
        adjust_learning_rate(optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch)
        precision = validate(val_loader, model, criterion)
        # decide whether this is the best checkpoint *before* updating
        # best_prec, otherwise is_best would always be False
        is_best = precision.data[0] > best_prec
        best_prec = max(precision.data[0], best_prec)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'best_prec1': best_prec,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, is_best)
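# save_checkpoint() is called above but not defined in this snippet. A
# minimal sketch in the spirit of the classic PyTorch ImageNet example
# (an assumption, not the original helper):
import shutil

import torch


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # always persist the latest state; keep a separate copy of the best one
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')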
def run_experiment(args):
    torch.manual_seed(args.seed)
    if not args.no_cuda:
        torch.cuda.manual_seed(args.seed)

    # Dataset
    if args.dataset == 'mnist':
        train_loader, test_loader, _, val_data = prepare_mnist(args)
    else:
        create_val_img_folder(args)
        train_loader, test_loader, _, val_data = prepare_imagenet(args)
    idx_to_class = {i: c for c, i in val_data.class_to_idx.items()}

    # Model & Criterion
    if args.model == 'AlexNet':
        if args.pretrained:
            model = models.__dict__['alexnet'](pretrained=True)
            # Change the last layer
            in_f = model.classifier[-1].in_features
            model.classifier[-1] = nn.Linear(in_f, args.classes)
        else:
            model = AlexNet(args.classes)
        criterion = nn.CrossEntropyLoss(size_average=False)
    else:
        model = SVM(args.features, args.classes)
        criterion = MultiClassHingeLoss(margin=args.margin,
                                        size_average=False)
    if not args.no_cuda:
        model.cuda()

    # Load saved model and test on it
    if args.load:
        model.load_state_dict(torch.load(args.model_path))
        val_acc = test(model, criterion, test_loader, 0, [], [],
                       idx_to_class, args)

    # Optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters())
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=args.momentum)

    total_minibatch_count = 0
    val_acc = 0
    train_losses, train_accs = [], []
    val_losses, val_accs = [], []

    # Train and test
    for epoch in range(1, args.epochs + 1):
        total_minibatch_count = train(model, criterion, optimizer,
                                      train_loader, epoch,
                                      total_minibatch_count,
                                      train_losses, train_accs, args)
        val_acc = test(model, criterion, test_loader, epoch, val_losses,
                       val_accs, idx_to_class, args)

    # Save model
    if args.save:
        if not os.path.exists(args.models_dir):
            os.makedirs(args.models_dir)
        filename = '_'.join(
            [args.prefix, args.dataset, args.model, 'model.pt'])
        torch.save(model.state_dict(),
                   os.path.join(args.models_dir, filename))

    # Plot graphs
    fig, axes = plt.subplots(1, 4, figsize=(13, 4))
    axes[0].plot(train_losses)
    axes[0].set_title('Loss')
    axes[1].plot(train_accs)
    axes[1].set_title('Acc')
    axes[1].set_ylim([0, 1])
    axes[2].plot(val_losses)
    axes[2].set_title('Val loss')
    axes[3].plot(val_accs)
    axes[3].set_title('Val Acc')
    axes[3].set_ylim([0, 1])
    # Images don't show on Ubuntu
    # plt.tight_layout()

    # Save results
    if not os.path.exists(args.results_dir):
        os.makedirs(args.results_dir)
    filename = '_'.join([args.prefix, args.dataset, args.model, 'plot.png'])
    fig.suptitle(filename)
    fig.savefig(os.path.join(args.results_dir, filename))
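# MultiClassHingeLoss is referenced above but not defined in this snippet.
# A minimal Crammer-Singer style sketch (an assumption, not the original
# class):
import torch
import torch.nn as nn


class MultiClassHingeLoss(nn.Module):
    def __init__(self, margin=1.0, size_average=False):
        super(MultiClassHingeLoss, self).__init__()
        self.margin = margin
        self.size_average = size_average

    def forward(self, output, target):
        # score of the correct class for each sample, shape (N, 1)
        correct = output.gather(1, target.view(-1, 1))
        # hinge term max(0, margin - correct + other) for every class
        loss = (self.margin - correct + output).clamp(min=0)
        # zero out the target column, which would otherwise contribute
        # exactly `margin` per sample
        loss = loss.scatter(1, target.view(-1, 1), 0.0)
        loss = loss.sum()
        if self.size_average:
            loss = loss / output.size(0)
        return loss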
def run():
    global args
    args = parser.parse_args()

    # load datasets, split train and test
    '''
    train_dataset = SketchData(root=path, train=True,
                               transform=transforms.ToTensor(),
                               target_transform=transforms.ToTensor())
    test_dataset = SketchData(root=path, train=False,
                              transform=transforms.ToTensor(),
                              target_transform=transforms.ToTensor())
    '''
    train_dataset = SketchData(root=path, train=True,
                               transform=None, target_transform=None)
    val_dataset = SketchData(root=path, train=False,
                             transform=None, target_transform=None)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.b,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.b,
                                             shuffle=False)

    # create model, set parameters, optimiser, loss
    model = AlexNet()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss()

    best_prec = 0
    args.start_epoch = 0

    # resume from a checkpoint; this must happen after the model and
    # optimizer are created, since their states are loaded into them
    if args.resume:
        if os.path.isfile(args.resume):
            print('Resuming from checkpoint!')
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch)
        precision = validate(val_loader, model, criterion)
        # compute is_best before updating best_prec, otherwise it is
        # always False
        is_best = precision.data[0] > best_prec
        best_prec = max(precision.data[0], best_prec)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'best_prec1': best_prec,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, is_best)
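# adjust_learning_rate() is likewise referenced but not defined here. A
# common implementation (assumed), decaying the initial LR by 10x every
# 30 epochs, using the module-level `args` from above:
def adjust_learning_rate(optimizer, epoch):
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr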
class TrainNetwork():
    def __init__(self, dataset, batch_size, epochs, lr, lr_decay_epoch,
                 momentum):
        assert dataset in ('letters', 'mnist')
        self.dataset = dataset
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.lr_decay_epoch = lr_decay_epoch
        self.momentum = momentum

        # letters contains 27 classes, digits contains 10 classes
        num_classes = 27 if dataset == 'letters' else 10

        # Load pre-trained AlexNet with changed number of output classes
        state_dict = torch.load('./trained_models/alexnet.pth')
        state_dict['classifier.6.weight'] = torch.zeros(num_classes, 4096)
        state_dict['classifier.6.bias'] = torch.zeros(num_classes)
        self.model = AlexNet(num_classes)
        self.model.load_state_dict(state_dict)

        # Use cuda if available
        if torch.cuda.is_available():
            self.model.cuda()

        # Load training dataset
        kwargs = {'num_workers': 1, 'pin_memory': True} \
            if torch.cuda.is_available() else {}
        self.train_loader = torch.utils.data.DataLoader(
            EMNIST('./data', dataset, download=True,
                   transform=transforms.Compose([
                       transforms.Lambda(correct_rotation),
                       transforms.Lambda(random_transform),
                       transforms.Resize((224, 224)),
                       transforms.RandomResizedCrop(224, (0.9, 1.1),
                                                    ratio=(0.9, 1.1)),
                       transforms.Grayscale(3),
                       transforms.ToTensor(),
                   ])),
            batch_size=batch_size, shuffle=True, **kwargs)

        # Optimizer and loss function
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr,
                                   momentum=self.momentum)
        self.loss_fn = nn.CrossEntropyLoss()

    def reduce_learning_rate(self, epoch):
        """
        Reduce the learning rate by factor 0.1 every lr_decay_epoch epochs
        :param epoch: Current epoch
        :return: None
        """
        lr = self.lr * (0.1 ** (epoch // self.lr_decay_epoch))
        if epoch % self.lr_decay_epoch == 0:
            print('LR is set to {}'.format(lr))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def train(self, epoch):
        """
        Train the model for one epoch and save the result as a .pth file
        :param epoch: Current epoch
        :return: None
        """
        self.model.train()
        train_loss = 0
        train_correct = 0
        progress = None
        for batch_idx, (data, target) in enumerate(self.train_loader):
            # Get data and label
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)

            # Optimize using backpropagation
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss_fn(output, target)
            train_loss += loss.data[0]
            pred = output.data.max(1, keepdim=True)[1]
            train_correct += pred.eq(target.data.view_as(pred)).sum()
            loss.backward()
            self.optimizer.step()

            # Print information about current step; compare with != rather
            # than `is not`, since identity checks on ints are unreliable
            current_progress = int(100 * (batch_idx + 1) * self.batch_size /
                                   len(self.train_loader.dataset))
            if current_progress != progress and current_progress % 5 == 0:
                progress = current_progress
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (batch_idx + 1) * len(data),
                    len(self.train_loader.dataset), current_progress,
                    loss.data[0]))

        train_loss /= (len(self.train_loader.dataset) / self.batch_size)
        # float division so the percentage is not truncated to 0
        train_correct = 100. * train_correct / len(self.train_loader.dataset)

        # Print information about current epoch
        print('Train Epoch: {} \tCorrect: {:3.2f}%\tAverage loss: {:.6f}'.format(
            epoch, train_correct, train_loss))

        # Save snapshot
        torch.save({
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }, './trained_models/{}_{}.pth'.format(self.dataset, epoch))

    def start(self):
        """
        Start training the network
        :return: None
        """
        for epoch in range(1, self.epochs + 1):
            self.reduce_learning_rate(epoch)
            self.train(epoch)
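# Example usage (hypothetical hyperparameter values):
trainer = TrainNetwork(dataset='letters', batch_size=64, epochs=10,
                       lr=0.01, lr_decay_epoch=5, momentum=0.9)
trainer.start()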
optimizer = torch.optim.Adam(model.parameters(), args.lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10,
#                                             gamma=0.1)
criterion = nn.CrossEntropyLoss()

# wandb config specification
config = wandb.config
config.learning_rate = args.lr
config.batch_size = args.batch_size
config.model = args.model

logging.info("Training...")
train_loss, train_acc, test_acc, test_loss = train_for_classification(
    net=model,
    dataset=train_dataset,
    batch_size=args.batch_size,
    optimizer=optimizer,
    criterion=criterion,
    epochs=args.epochs)

fig_metrics = plot_metrics(train_loss, train_acc, test_loss, test_acc,
                           f"{model.__class__.__name__}_metrics.png")
wandb.log({'metrics': wandb.Image(fig_metrics)})
wandb.log({
    'metrics_file': wandb.Image(f"{model.__class__.__name__}_metrics.png")
})

logging.info("Saving...")
model_name = f"last_{model.__class__.__name__}.pth"
torch.save(model.state_dict(), model_name)
wandb.save(model_name)
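# plot_metrics() is not defined in this snippet. A minimal sketch with the
# signature implied by the call above (assumed: per-epoch lists plus an
# output file name; returns the figure so it can be logged to W&B):
import matplotlib.pyplot as plt


def plot_metrics(train_loss, train_acc, test_loss, test_acc, filename):
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    axes[0].plot(train_loss, label='train')
    axes[0].plot(test_loss, label='test')
    axes[0].set_title('Loss')
    axes[0].legend()
    axes[1].plot(train_acc, label='train')
    axes[1].plot(test_acc, label='test')
    axes[1].set_title('Accuracy')
    axes[1].legend()
    fig.savefig(filename)
    return fig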
dataset = Rand_num()
sampler = RandomSampler(dataset)
loader = DataLoader(dataset, batch_size=20, sampler=sampler, shuffle=False,
                    num_workers=1, drop_last=True)
net = AlexNet(3)
# net.load_state_dict(torch.load(SAVE_PATH))
net.cuda()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(10000):
    for i, data in enumerate(loader, 0):
        net.zero_grad()
        video, labels = data
        video = video.view(-1, 3, 227, 227)
        labels = labels.view(-1, 3)
        labels = torch.squeeze(Variable(labels.float().cuda()))
        # scale pixel values into [0, 1)
        video = torch.squeeze(Variable((video.float() / 256).cuda()))
        net.train()
        outputs = net(video)  # call the module, not .forward(), so hooks run
        loss = lossfunction(outputs, labels)
        loss.backward()
        optimizer.step()
        if i == 0:
            # save a snapshot and log once per epoch, at the first batch
            torch.save(net.state_dict(), SAVE_PATH)
            print(loss)
            logger.scalar_summary('loss', loss.data.cpu().numpy(), epoch)
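# lossfunction is used above but never defined. Given the float targets of
# shape (N, 3), one plausible (assumed) definition is mean squared error:
import torch.nn as nn

lossfunction = nn.MSELoss()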
# inside the batch loop (optimizer.zero_grad() is assumed to happen
# earlier in the loop, and running_loss / running_error to start at 0)
outputs = model(images)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()

# accumulate rather than overwrite, so the per-100-batch averages below
# are meaningful
running_loss += loss.item()
running_error += (outputs.max(dim=1)[1] != labels).sum().item()
if i % 100 == 99:  # Print every 100 mini-batches
    print('Epoch / Batch [%d / %d] - Loss: %.3f - Error: %.3f' %
          (epoch + 1, i + 1, running_loss / 100, running_error / 100))
    running_loss, running_error = 0.0, 0

# at the end of each epoch
if epoch % args.save_interval == (args.save_interval - 1):
    model_name = get_model_name(args.run_number, epoch)
    torch.save(model.state_dict(), CKPT_DIR + model_name + ".ckpt")
    text = "SAVED"
    print(f"{text:>20} {model_name}.ckpt")
    # Use save interval as test interval too for now
    test_model(model, epoch)
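# get_model_name() is used above but not shown here. A minimal hypothetical
# implementation consistent with the call site:
def get_model_name(run_number, epoch):
    # e.g. "run3_epoch12"
    return "run{}_epoch{}".format(run_number, epoch)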
def train_generic_model(model_name="alexnet",
                        dataset="custom",
                        num_classes=-1,
                        batch_size=8,
                        is_transform=1,
                        num_workers=2,
                        lr_decay=1,
                        l2_reg=0,
                        hdf5_path="dataset-bosch-224x224.hdf5",
                        trainset_dir="./TRAIN_data_224_v8",
                        testset_dir="./TEST_data_224_v8",
                        convert_grey=False):
    CHKPT_PATH = "./checkpoint_{}.PTH".format(model_name)
    print("CUDA:")
    print(torch.cuda.is_available())

    if is_transform:
        trans_ls = []
        if convert_grey:
            trans_ls.append(transforms.Grayscale(num_output_channels=1))
        trans_ls.extend([
            transforms.Resize((224, 224)),
            # transforms.RandomCrop((224, 224)),
            # transforms.Grayscale(num_output_channels=1),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        transform = transforms.Compose(trans_ls)
    else:
        transform = None

    print("DATASET FORMAT: {}".format(dataset))
    print("TRAINSET PATH: {}".format(trainset_dir))
    print("TESTSET PATH: {}".format(testset_dir))
    print("HDF5 PATH: {}".format(hdf5_path))

    if dataset == "custom":
        trainset = torchvision.datasets.ImageFolder(root=trainset_dir,
                                                    transform=transform)
        testset = torchvision.datasets.ImageFolder(root=testset_dir,
                                                   transform=transform)
    elif dataset == "cifar":
        trainset = torchvision.datasets.CIFAR10(root="CIFAR_TRAIN_data",
                                                train=True, download=True,
                                                transform=transform)
        testset = torchvision.datasets.CIFAR10(root="CIFAR_TEST_data",
                                               train=False, download=True,
                                               transform=transform)
    elif dataset == "hdf5":
        # the MPI variant is needed when several workers read the file
        dataset_cls = Hdf5Dataset if num_workers == 1 else Hdf5DatasetMPI
        trainset = dataset_cls(hdf5_path, transform=transform, is_test=False)
        testset = dataset_cls(hdf5_path, transform=transform, is_test=True)
    train_size = len(trainset)
    test_size = len(testset)

    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)
    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=num_workers)

    # map model names to constructors instead of a long if/elif chain
    model_classes = {
        "alexnet": AlexNet, "lenet5": LeNet5, "stn-alexnet": STNAlexNet,
        "stn-lenet5": LeNet5STN, "capsnet": CapsuleNet,
        "convneta": ConvNetA, "convnetb": ConvNetB, "convnetc": ConvNetC,
        "convnetd": ConvNetD, "convnete": ConvNetE, "convnetf": ConvNetF,
        "convnetg": ConvNetG, "convneth": ConvNetH, "convneti": ConvNetI,
        "convnetj": ConvNetJ, "convnetk": ConvNetK, "convnetl": ConvNetL,
        "convnetm": ConvNetM, "convnetn": ConvNetN,
    }
    if model_name == "resnet18":
        net = models.resnet18(pretrained=False, num_classes=num_classes)
    else:
        net = model_classes[model_name](num_classes=num_classes)
    print(net)

    if torch.cuda.is_available():
        net = net.cuda()

    if model_name == "capsnet":
        criterion = CapsuleLoss()
    else:
        criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=LEARNING_RATE, momentum=0.9,
                          weight_decay=l2_reg)
    if lr_decay:
        scheduler = ReduceLROnPlateau(optimizer, 'min')

    best_acc = 0
    from_epoch = 0
    if os.path.exists(CHKPT_PATH):
        print("Checkpoint Found: {}".format(CHKPT_PATH))
        state = torch.load(CHKPT_PATH)
        net.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        best_acc = state['best_accuracy']
        from_epoch = state['epoch']

    for epoch in range(from_epoch, NUM_EPOCHS):
        # Training pass
        epoch_loss = 0
        correct = 0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            inputs = Variable(inputs).type(torch.FloatTensor)
            labels = Variable(labels).type(torch.LongTensor)
            if model_name == "capsnet":
                inputs = augmentation(inputs)
                # one-hot encode the labels for the capsule loss
                ground_truth = torch.eye(num_classes).index_select(
                    dim=0, index=labels)
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            if model_name == "capsnet":
                classes, reconstructions = net(inputs, ground_truth)
                loss = criterion(inputs, ground_truth, classes,
                                 reconstructions)
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.data[0]
            if model_name != "capsnet":
                log_outputs = F.softmax(outputs, dim=1)
            else:
                log_outputs = classes
            pred = log_outputs.data.max(1, keepdim=True)[1]
            correct += pred.eq(labels.data.view_as(pred)).sum()
        print("Epoch: {} \t Training Loss: {:.4f} \t Training Accuracy: "
              "{:.2f} \t {}/{}".format(epoch + 1, epoch_loss / train_size,
                                       100 * correct / train_size,
                                       correct, train_size))

        # Testing pass
        correct = 0
        test_loss = 0
        for i, data in enumerate(test_loader, 0):
            inputs, labels = data
            inputs = Variable(inputs).type(torch.FloatTensor)
            labels = Variable(labels).type(torch.LongTensor)
            if model_name == "capsnet":
                inputs = augmentation(inputs)
                ground_truth = torch.eye(num_classes).index_select(
                    dim=0, index=labels)
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            if model_name == "capsnet":
                classes, reconstructions = net(inputs)
                loss = criterion(inputs, ground_truth, classes,
                                 reconstructions)
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)
            test_loss += loss.data[0]
            if model_name != "capsnet":
                log_outputs = F.softmax(outputs, dim=1)
            else:
                log_outputs = classes
            pred = log_outputs.data.max(1, keepdim=True)[1]
            correct += pred.eq(labels.data.view_as(pred)).sum()
        print("Epoch: {} \t Testing Loss: {:.4f} \t Testing Accuracy: "
              "{:.2f} \t {}/{}".format(epoch + 1, test_loss / test_size,
                                       100 * correct / test_size,
                                       correct, test_size))

        # Save the best model so far
        if correct >= best_acc:
            if not os.path.exists("./models"):
                os.mkdir("./models")
            torch.save(
                net.state_dict(),
                "./models/model-{}-{}-{}-{}-val-acc-{:.2f}-train-{}-test-{}"
                "-epoch-{}.pb".format(
                    model_name, dataset, hdf5_path, str(datetime.now()),
                    100 * correct / test_size,
                    trainset_dir.replace(" ", "_").replace("/", "_"),
                    testset_dir.replace(" ", "_").replace("/", "_"),
                    epoch + 1))
        best_acc = max(best_acc, correct)

        # save a resumable checkpoint
        state = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_accuracy': best_acc
        }
        torch.save(state, CHKPT_PATH)

        if lr_decay:
            # Note that step should be called after validate()
            scheduler.step(test_loss)

    print('Finished Training')
    print("")
    print("")
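# Example invocation (hypothetical values; assumes the module-level
# LEARNING_RATE and NUM_EPOCHS constants used inside the function are set):
train_generic_model(model_name="alexnet",
                    dataset="custom",
                    num_classes=10,
                    batch_size=8,
                    trainset_dir="./TRAIN_data_224_v8",
                    testset_dir="./TEST_data_224_v8")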