def train_pipe(args, part='parameters'):
    torch.manual_seed(args.seed)
    deepspeed.runtime.utils.set_random_seed(args.seed)

    #
    # Build the model
    #

    # VGG also works :-)
    # net = vgg19(num_classes=10)
    net = AlexNet(num_classes=10)
    net = PipelineModule(layers=join_layers(net),
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         num_stages=args.pipeline_parallel_size,
                         partition_method=part,
                         activation_checkpoint_interval=0)

    trainset = cifar_trainset(args.local_rank)

    engine, _, _, _ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)

    for step in range(args.steps):
        loss = engine.train_batch()
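
# join_layers() is assumed above. A sketch of it under the assumption that
# `net` is torchvision's AlexNet: PipelineModule wants the network as a flat
# list of layers/callables, so we unpack features/avgpool/classifier and
# bridge the conv and linear parts with a flatten lambda.
import torch

def join_layers(vision_model):
    return [
        *vision_model.features,
        vision_model.avgpool,
        lambda x: torch.flatten(x, 1),
        *vision_model.classifier,
    ]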

def pytorch_cos():
    model = AlexNet(num_classes=2)
    optimizer = optim.SGD(params=model.parameters(), lr=0.0001)
    epoch = 100
    len_loader = 100
    scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                         T_0=2,
                                                         T_mult=2,
                                                         eta_min=1e-6,
                                                         last_epoch=-1)
    plt.figure()
    x = []
    y = []
    for e in range(epoch):
        for i in range(len_loader):
            step = e + i / len_loader
            scheduler.step(step)
            lr = scheduler.get_last_lr()[0]
            x.append(step)
            y.append(lr)
    plt.plot(x, y)
    plt.xticks(np.arange(0, epoch + 1, 4))
    plt.show()
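
# A hand-rolled check of the SGDR curve plotted above: PyTorch's
# CosineAnnealingWarmRestarts follows
#     eta_min + (eta_max - eta_min) * (1 + cos(pi * T_cur / T_i)) / 2
# with cycle lengths T_0, T_0*T_mult, ... (eta_max is the optimizer's base
# lr; this is a sketch for verification, not the library code).
import math

def sgdr_lr(step, eta_min=1e-6, eta_max=1e-4, T_0=2, T_mult=2):
    T_i, t = T_0, step
    while t >= T_i:          # locate the current restart cycle
        t -= T_i
        T_i *= T_mult
    return eta_min + (eta_max - eta_min) * (1 + math.cos(math.pi * t / T_i)) / 2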

def get_model(device=None):
    # Load the CNN model
    model = AlexNet(num_classes=2)
    model.load_state_dict(
        torch.load('./models/best_linear_svm_alexnet_car.pth'))
    model.eval()

    # Disable gradient tracking
    for param in model.parameters():
        param.requires_grad = False
    if device:
        model = model.to(device)
    return model
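
# Freezing parameters as above keeps autograd off permanently; for pure
# inference, wrapping calls in torch.no_grad() is equally valid. A
# hypothetical usage sketch of the frozen model:
import torch

def predict(model, batch):
    with torch.no_grad():    # no autograd graph is built either way
        logits = model(batch)
    return logits.argmax(dim=1)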

def train_base(args):
    torch.manual_seed(args.seed)

    # VGG also works :-)
    # net = vgg19(num_classes=10)
    net = AlexNet(num_classes=10)

    trainset = cifar_trainset(args.local_rank)

    engine, _, dataloader, __ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)

    dataloader = RepeatingLoader(dataloader)
    data_iter = iter(dataloader)

    rank = dist.get_rank()
    gas = engine.gradient_accumulation_steps()

    criterion = torch.nn.CrossEntropyLoss()

    total_steps = args.steps * engine.gradient_accumulation_steps()
    step = 0
    for micro_step in range(total_steps):
        batch = next(data_iter)
        inputs = batch[0].to(engine.device)
        labels = batch[1].to(engine.device)

        outputs = engine(inputs)
        loss = criterion(outputs, labels)
        engine.backward(loss)
        engine.step()

        if micro_step % engine.gradient_accumulation_steps() == 0:
            step += 1
            if rank == 0 and (step % 10 == 0):
                print(f'step: {step:3d} / {args.steps:3d} loss: {loss}')
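
# RepeatingLoader is assumed above; this mirrors the small wrapper shipped
# with DeepSpeed's pipeline-parallelism example: it restarts the underlying
# iterator so next(data_iter) never raises StopIteration mid-run.
class RepeatingLoader:
    def __init__(self, loader):
        self.loader = loader
        self.data_iter = iter(self.loader)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            batch = next(self.data_iter)
        except StopIteration:
            self.data_iter = iter(self.loader)
            batch = next(self.data_iter)
        return batch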

def train():
    torch.multiprocessing.freeze_support()
    # join the directory paths
    traindir = os.path.join('./200508_cat_classification/dogs-vs-cats', 'train')
    testdir = os.path.join('./200508_cat_classification/dogs-vs-cats', 'test')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_loader = datautil.DataLoader(TrainImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])),
                                       batch_size=4,
                                       shuffle=True,
                                       num_workers=4,
                                       pin_memory=True)
    test_loader = datautil.DataLoader(TestImageFolder(
        testdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                      batch_size=1,
                                      shuffle=False,
                                      num_workers=1,
                                      pin_memory=False)

    net = AlexNet()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = net.to(device)
    load_model(net, './alexnet.pth')
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = nn.DataParallel(net)

    import torch.optim as optim

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.004)
    for epoch in range(3):
        running_loss = 0.0
        correct = 0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            # Variable is deprecated; .to(device) also works on CPU-only hosts
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            prediction = torch.max(outputs.data, 1)[1]
            correct += prediction.eq(
                labels.data.view_as(prediction)).cpu().sum()
            if i % 2000 == 1999:
                total = (i + 1) * 4
                print(
                    f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.6f} acc : {correct} / {total}'
                )
                running_loss = 0.0
    print('Finished Training')
    save_model(net, './')
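
# test_loader is built above but never consumed; a minimal evaluation pass
# could look like this (assumes TestImageFolder yields (image, label) pairs;
# an unlabeled Kaggle test split would yield filenames instead).
def evaluate(net, test_loader, device):
    net.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = net(inputs).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total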

import torch.optim as optim
from torchvision.models import AlexNet
import matplotlib.pyplot as plt
from utils.lr_scheduler import CosineAnnealingWarmupRestarts


def plot(lr_list):
    f = plt.figure()
    plt.plot(lr_list)
    plt.show()


epochs = 200
iterations = 100
model = AlexNet()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
# scheduler = lr_scheduler.CosineAnnealingLR(optimizer, epochs, eta_min=1e-4, last_epoch=-1)
# scheduler = lr_scheduler.CosineAnnealingLR(optimizer, iterations, eta_min=1e-4, last_epoch=-1)
scheduler = CosineAnnealingWarmupRestarts(optimizer,
                                          first_cycle_steps=iterations,
                                          cycle_mult=0.5,
                                          max_lr=0.1,
                                          min_lr=0.0,
                                          warmup_steps=1,
                                          gamma=0.5)

# this zero gradient update is needed to avoid a warning message, issue #8.
optimizer.zero_grad()
lr_list = list()
for epoch in range(epochs):
    optimizer.step()
    scheduler.step()
    lr_list.append(optimizer.param_groups[0]['lr'])
plot(lr_list)
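
# If the custom scheduler is unavailable, recent torch (1.10+) can
# approximate warmup-then-cosine with built-ins; an illustrative sketch
# reusing the optimizer above:
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR

warmup = LinearLR(optimizer, start_factor=0.1, total_iters=1)
cosine = CosineAnnealingLR(optimizer, T_max=iterations - 1, eta_min=0.0)
scheduler_alt = SequentialLR(optimizer, schedulers=[warmup, cosine],
                             milestones=[1])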

    def CyclicLR_(self,
                  base_lr,
                  max_lr,
                  step_size_up,
                  step_size_down=None,
                  mode='triangular'):
        scheduler = CyclicLR(self.optimizer,
                             base_lr=base_lr,
                             max_lr=max_lr,
                             step_size_up=step_size_up,
                             step_size_down=step_size_down,
                             mode=mode)
        return scheduler

    def adjust(self, base_lr, type):
        pass


if __name__ == '__main__':
    net = AlexNet(num_classes=2)
    optimizer = SGD(net.parameters(), lr=0.0003)
    adj = AdjustLr(optimizer)
    sch1 = adj.LambdaLR_(milestone=5, gamma=0.92)
    epoches = 40
    plt.figure()
    x1 = list(range(epoches))
    y1 = list()
    lr = optimizer.param_groups[0]['lr']
    for epoch in range(epoches):
        optimizer.step()
        sch1.step(epoch)
        a = sch1.get_lr()
        print(epoch, a)
        y1.append(a)
    plt.plot(x1, y1)
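
# LambdaLR_ is not shown above; given the call adj.LambdaLR_(milestone=5,
# gamma=0.92), a plausible reconstruction (hypothetical, not the original
# code) is a LambdaLR with step decay:
from torch.optim.lr_scheduler import LambdaLR

class AdjustLrSketch:
    def __init__(self, optimizer):
        self.optimizer = optimizer

    def LambdaLR_(self, milestone, gamma):
        # multiply the base lr by gamma once every `milestone` epochs
        return LambdaLR(self.optimizer,
                        lr_lambda=lambda epoch: gamma ** (epoch // milestone))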

transform = transforms.Compose([
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

trainset = torchvision.datasets.CIFAR10(root='./data',
                                        train=True,
                                        download=True,
                                        transform=transform)

deepspeed.init_distributed()
net = AlexNet(num_classes=10)
net = PipelineModule(layers=join_layers(net),
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     num_stages=2,
                     partition_method="parameters",
                     activation_checkpoint_interval=0)

args = add_argument()
engine, optimizer, trainloader, __ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad],
    training_data=trainset)

for step in range(steps):
    loss = engine.train_batch()
    print(loss)

# deepspeed --hostfile=./hostfile model_parallel/deepspeed/tutorial.py --deepspeed --deepspeed_config model_parallel/deepspeed/ds_config.json
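
# The launch command above points at ds_config.json; deepspeed.initialize
# also accepts the config inline via its `config` argument. A minimal
# illustrative config (values are examples, not from the original repo):
ds_config = {
    "train_batch_size": 16,
    "train_micro_batch_size_per_gpu": 4,
    "optimizer": {"type": "SGD", "params": {"lr": 0.001}},
    "steps_per_print": 10,
}
# engine, _, _, _ = deepspeed.initialize(model=net, config=ds_config, ...)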

            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        return self.classifier(x)


model = AlexNet(10).to(device)
h1 = hl.build_graph(model, torch.zeros(64, 3, 224, 224).to(device))
h1.save('images/alexnet.png', format='png')

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def update_lr(optimizer, lr):
    """For updating learning rate."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


# Train the model
total_step = len(train_loader)
curr_lr = learning_rate
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
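
# update_lr() is usually invoked at epoch boundaries; one common pattern
# consistent with the curr_lr bookkeeping above (illustrative, placed inside
# the epoch loop):
#     if (epoch + 1) % 20 == 0:
#         curr_lr /= 3
#         update_lr(optimizer, curr_lr)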

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision.models import AlexNet
import matplotlib.pyplot as plt

model = AlexNet(num_classes=2)
optimizer = optim.SGD(params=model.parameters(), lr=0.01)


def f_step():
    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.98)
    x = list(range(100))
    y = []
    for epoch in range(100):
        scheduler.step()
        lr = scheduler.get_last_lr()[0]  # get_lr() is unreliable mid-step
        print(epoch, lr)
        y.append(lr)
    return x, y


def f_multistep():
    scheduler = lr_scheduler.MultiStepLR(optimizer, [30, 80], gamma=0.98)
    x = list(range(100))
    y = []
    for epoch in range(100):
        scheduler.step()
        lr = scheduler.get_last_lr()[0]
        print(epoch, lr)
        y.append(lr)
    return x, y
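
# Plotting glue for the two sweeps above (a sketch; both functions share the
# module-level optimizer, so its lr is reset between sweeps):
if __name__ == '__main__':
    x, y = f_step()
    plt.plot(x, y, label='StepLR')
    optimizer.param_groups[0]['lr'] = 0.01  # undo the first sweep's decay
    x, y = f_multistep()
    plt.plot(x, y, label='MultiStepLR')
    plt.legend()
    plt.show()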

from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision as tv
import torchvision.transforms as transforms

# Saving and loading tensors
a = torch.Tensor(3, 4)
a = a.cuda()  # .cuda() returns a copy; assign it back
torch.save(a, 'a.pth')
b = torch.load('a.pth')
# remap storage onto the CPU regardless of where the tensor was saved
c = torch.load('a.pth', map_location=lambda sto, loc: sto)
# ----------------------------------------------------------
torch.set_default_tensor_type('torch.FloatTensor')
from torchvision.models import AlexNet
model = AlexNet()
model.state_dict().keys()

# Saving and loading model weights
torch.save(model.state_dict(), 'alexnet.pth')
model.load_state_dict(torch.load('alexnet.pth'))

opt = torch.optim.Adam(model.parameters(), lr=0.1)
# Saving and loading optimizer state
torch.save(opt.state_dict(), 'opt.pth')
opt.load_state_dict(torch.load('opt.pth'))
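
# A common extension of the above: bundle model and optimizer state into a
# single checkpoint file (same torch.save/torch.load APIs):
torch.save({'model': model.state_dict(),
            'optimizer': opt.state_dict()}, 'ckpt.pth')
ckpt = torch.load('ckpt.pth')
model.load_state_dict(ckpt['model'])
opt.load_state_dict(ckpt['optimizer'])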

        torch.save(
            {
                'epoch': self.T_max,
                'state_dict': self.model.state_dict()
            }, self.out_dir + "Weight/" +
            'snapshot_e_{:03d}.pth.tar'.format(self.T_max))

        ## reset epochs since the last reset
        self.current_epoch = 0

        ## reset the next goal
        self.Te = int(self.Te * self.T_mult)
        self.T_max = self.T_max + self.Te


if __name__ == '__main__':
    from torchvision.models import AlexNet
    model = AlexNet(num_classes=2)
    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2)
    scheduler = CosineAnnealingLR_with_Restart(optimizer,
                                               T_max=4,
                                               T_mult=1,
                                               model=model,
                                               out_dir='./',
                                               take_snapshot=False)
    for epoch in range(20):
        lr = scheduler.get_lr()
        print(lr)
        scheduler.step()
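
# With take_snapshot=True the scheduler above writes one checkpoint per
# restart, which enables a simple snapshot ensemble at test time; a hedged
# sketch (the Weight/ path mirrors the save call above):
import glob

def ensemble_logits(model, batch, weight_dir='./Weight/'):
    logits = []
    for path in sorted(glob.glob(weight_dir + 'snapshot_e_*.pth.tar')):
        state = torch.load(path, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        model.eval()
        with torch.no_grad():
            logits.append(model(batch))
    return sum(logits) / len(logits)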

    os.makedirs(cpkRoot, exist_ok=True)
    filePath = os.path.join(cpkRoot, fileName)
    bestPath = os.path.join(cpkRoot, 'model_best.pth.tar')
    torch.save(state, filePath)  # save before copying, so the source exists
    if isBest:
        shutil.copyfile(filePath, bestPath)


if __name__ == '__main__':
    from easydict import EasyDict as edict
    args = edict()
    args.lr = 0.2
    args.nEpochs = 40
    args.power = 0.9
    model = AlexNet(num_classes=2)
    optimizer = optim.SGD(params=model.parameters(), lr=args.lr)
    # scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.2)
    plt.figure()
    x = list(range(10))
    y = []
    for epoch in range(10):
        # scheduler.step(epoch)
        # y.append(scheduler.get_lr()[0])
        adjust_learning_rate(optimizer, epoch, args)
        y.append(optimizer.param_groups[0]['lr'])
    plt.plot(x, y)
    plt.show()
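
# adjust_learning_rate() is not shown; given args.power and args.nEpochs, a
# plausible polynomial ("poly") decay reconstruction (hypothetical, not the
# original code):
def adjust_learning_rate(optimizer, epoch, args):
    lr = args.lr * (1 - epoch / args.nEpochs) ** args.power
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr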

    plt.ylabel('Cross entropy Loss')
    plt.legend()
    plt.show()


def plot_data(exp_id):
    epochs, lrs, train_err, val_err, train_loss, val_loss = load_experiment(
        exp_id)
    plot_lr_data(epochs, lrs)
    plot_err_data(epochs, train_err, val_err)
    plot_loss_data(epochs, train_loss, val_loss)


if __name__ == '__main__':
    plot_data(8)
    exit(1)

    from torchvision.models import AlexNet
    model = AlexNet()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    scheduler = CosineWithRestartLR(
        optimizer,
        min_lr=1e-4,
        max_lr=0.1,
        restart_interval=10,
        restart_multiplier=2,
        amplitude_decay=1
    )
    # scheduler = AdaptiveLR(
    #     optimizer,
    #     start_lr=0.01,
    #     mu=0.99,
    #     eps=0.1,
    #     last_epoch=-1
    # )
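
# The scheduler above is constructed but never stepped (the demo exits
# first); a minimal sweep to visualize it, assuming CosineWithRestartLR
# follows the standard _LRScheduler interface:
#     lrs = []
#     for epoch in range(100):
#         optimizer.step()
#         scheduler.step()
#         lrs.append(optimizer.param_groups[0]['lr'])
#     plt.plot(lrs)
#     plt.show()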