def train(model, device, train_loader, optimizer, epoch):
    """Run a single training epoch over `train_loader`.

    Tracks running NLL loss and top-1 precision with AverageMeters and
    prints progress every `args.print_freq` batches.
    """
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    model.train()  # enable dropout / batch-norm training behavior

    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        logits = model(inputs)
        batch_loss = F.nll_loss(logits, labels)

        # Record statistics from this forward pass before the update,
        # so the printed values match the batch that produced them.
        top1_prec = accuracy(logits, labels, topk=(1,))[0]
        loss_meter.update(batch_loss.item(), inputs.size(0))
        acc_meter.update(top1_prec.item(), inputs.size(0))

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, step, len(train_loader),
                      loss=loss_meter, top1=acc_meter))
def test(model, device, test_loader, epoch):
    """Evaluate on the test set; returns the average top-1 precision."""
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    model.eval()  # disable dropout / use running batch-norm statistics

    for step, (inputs, labels) in enumerate(test_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # No gradients are needed during evaluation.
        with torch.no_grad():
            logits = model(inputs)
            batch_loss = F.nll_loss(logits, labels)

        top1_prec = accuracy(logits, labels, topk=(1,))[0]
        loss_meter.update(batch_loss.item(), inputs.size(0))
        acc_meter.update(top1_prec.item(), inputs.size(0))

        if step % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      step, len(test_loader),
                      loss=loss_meter, top1=acc_meter))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=acc_meter))
    return acc_meter.avg
def train(epoch):
    """Run one VAE training epoch, then checkpoint the whole model.

    The checkpoint filename encodes the run prefix, the epoch start time,
    and the epoch number.
    """
    model.train()
    epoch_loss = 0.0
    start_time = datetime.datetime.now()
    prefix = 'vanila'

    for step, (batch, batch_labels) in enumerate(train_loader):
        batch = batch.to(device)
        # print(batch_labels)
        batch_labels = classes_to_one_hot(batch_labels)
        # print(batch_labels)
        # NOTE(review): one-hot labels are prepared and moved to the device
        # but never passed to the model — confirm whether this is intended.
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        reconstruction, mu, logvar = model(batch)
        batch_loss = loss_function(reconstruction, batch, mu, logvar)
        batch_loss.backward()
        epoch_loss += batch_loss.item()
        optimizer.step()

        if step % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(batch), len(train_loader.dataset),
                100. * step / len(train_loader),
                batch_loss.item() / len(batch)))

    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, epoch_loss / len(train_loader.dataset)))
    torch.save(model, f'checkpoints/{prefix}_{str(start_time)}_{epoch}.pt')
def test(model, test_loader, device):
    """Evaluate `model` on `test_loader`; log average NLL loss and accuracy.

    Args:
        model: network producing log-probabilities (log_softmax output).
        test_loader: DataLoader yielding (data, target) batches.
        device: torch device to run the evaluation on.
    """
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum up batch loss. reduction='sum' replaces the deprecated
            # size_average=False, which was removed in modern PyTorch.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the max log-probability = predicted class.
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    logger.info('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
def test(epoch):
    """Compute the average VAE test loss; save a reconstruction grid for batch 0.

    The saved image stacks the first up-to-8 originals above their
    reconstructions in results/reconstruction_<epoch>.png.
    """
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_function(recon_batch, data, mu, logvar).item()
            if i == 0:
                n = min(data.size(0), 8)
                # Use the actual batch size rather than args.batch_size:
                # the final (or only) batch may be smaller, which made
                # .view(args.batch_size, ...) raise a shape error.
                comparison = torch.cat(
                    [data[:n],
                     recon_batch.view(data.size(0), 1, 28, 28)[:n]])
                save_image(comparison.cpu(),
                           'results/reconstruction_' + str(epoch) + '.png',
                           nrow=n)

    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))
def train_transient(model, device, train_loader, optimizer, epoch, track=False):
    """Train for one epoch, optionally recording per-sample losses.

    When `track` is True, returns one list per batch of
    [dataset_index, loss_value] pairs; otherwise returns None.
    """
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    model.train()
    epoch_stats = []

    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        logits = model(inputs)
        # Keep per-sample losses so they can be logged individually.
        per_sample = F.nll_loss(logits, labels, reduction='none')

        if track:
            base = step * train_loader.batch_size
            epoch_stats.append(
                [[base + j, l.item()] for j, l in enumerate(per_sample)])

        batch_loss = per_sample.mean()

        prec1 = accuracy(logits, labels, topk=(1,))[0]
        loss_meter.update(batch_loss.item(), inputs.size(0))
        acc_meter.update(prec1.item(), inputs.size(0))

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, step, len(train_loader),
                      loss=loss_meter, top1=acc_meter))

    return epoch_stats if track else None
def train(epoch):
    """Run one VAE training epoch and print the per-sample average loss."""
    model.train()
    running_loss = 0.0

    for step, (batch, _) in enumerate(train_loader):
        batch = batch.to(device)
        optimizer.zero_grad()

        reconstruction, mu, logvar = model(batch)
        batch_loss = loss_function(reconstruction, batch, mu, logvar)
        batch_loss.backward()
        running_loss += batch_loss.item()
        optimizer.step()

        if step % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(batch), len(train_loader.dataset),
                100. * step / len(train_loader),
                batch_loss.item() / len(batch)))

    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, running_loss / len(train_loader.dataset)))
num_workers=4) seed_everything(42) n_epochs = 30 for epoch in tqdm(range(1, n_epochs + 1)): # monitor training loss train_loss = 0.0 ################### # train the model # ################### for data in train_loader: # _ stands in for labels, here # no need to flatten images data = data.to(device) # clear the gradients of all optimized variables optimizer.zero_grad() # forward pass: compute predicted outputs by passing inputs to the model outputs = model(data) # calculate the loss loss = criterion(outputs, data) #print(loss.item()) # backward pass: compute gradient of the loss with respect to model parameters loss.backward() # perform a single optimization step (parameter update) optimizer.step() # update running training loss train_loss += loss.item() * data.size(0) # print avg training statistics
def one_train(loader, model, criterion, optimizer, epoch):
    """Train the cross-modal retrieval model for one epoch.

    Returns (cos_loss_val, img_loss_val, rec_loss_val). The two class-loss
    values are None when semantic regularization is disabled (the original
    raised NameError in that case by returning undefined meters).
    """
    print("LOG : training phase , epoch = ", epoch)

    # Initialize running-average meters.
    cos_losses = AverageMeter()
    if opts.semantic_reg:
        img_losses = AverageMeter()
        rec_losses = AverageMeter()

    data_num = len(loader.dataset)  # total number of samples
    pbar = tqdm(total=int(data_num / opts.batch_size))  # progress bar

    model.train()  # switch model to training mode
    for batch, (inputs, targets) in enumerate(loader):
        # Move data to the device:
        # (image, inst, len(inst), ingr, len(ingr)), [target, img_class, rec_class]
        input_var = [data.to(DEVICE, non_blocking=True) for data in inputs]
        target_var = [data.to(DEVICE, non_blocking=True) for data in targets]

        outputs = model(input_var[0], input_var[1], input_var[2],
                        input_var[3], input_var[4])

        # Loss computation depends on whether category classification is used.
        # FIX: read opts.semantic_reg consistently — the original mixed it
        # with a SEMANTIC_REG constant, which could disagree with the meter
        # setup above and crash in the final print.
        if opts.semantic_reg:
            cos_loss = criterion[0](outputs[0], outputs[1],
                                    target_var[0].float())
            img_loss = criterion[1](outputs[2], target_var[1])
            rec_loss = criterion[1](outputs[3], target_var[2])
            # Combined loss.
            loss = opts.cos_weight * cos_loss +\
                opts.cls_weight * img_loss +\
                opts.cls_weight * rec_loss
            # Record scalar losses (.item() detaches from the graph).
            cos_losses.update(cos_loss.item(), inputs[0].size(0))
            img_losses.update(img_loss.item(), inputs[0].size(0))
            rec_losses.update(rec_loss.item(), inputs[0].size(0))
        else:
            loss = criterion(outputs[0], outputs[1], target_var[0])
            # FIX: loss.data[0] fails on 0-dim tensors in PyTorch >= 0.4.
            cos_losses.update(loss.item(), inputs[0].size(0))

        optimizer.zero_grad()  # reset gradients
        loss.backward()        # backpropagate
        optimizer.step()       # update parameters
        pbar.update(1)

    pbar.close()

    if opts.semantic_reg:
        print('Epoch: {0}\t'
              'cos loss:{cos_loss.val:.4f} ({cos_loss.avg:.4f}) '
              'img Loss:{img_loss.val:.4f} ({img_loss.avg:.4f}) '
              'rec loss:{rec_loss.val:.4f} ({rec_loss.avg:.4f}) '
              'vision_lr:({visionLR})-recipe_lr:({recipeLR}) '.format(
                  epoch, cos_loss=cos_losses, img_loss=img_losses,
                  rec_loss=rec_losses,
                  visionLR=optimizer.param_groups[1]['lr'],
                  recipeLR=optimizer.param_groups[0]['lr']))
        return cos_losses.val, img_losses.val, rec_losses.val

    print('Epoch: {0}\t'
          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
          'vision ({visionLR}) - recipe ({recipeLR})\t'.format(
              epoch, loss=cos_losses,
              visionLR=optimizer.param_groups[1]['lr'],
              recipeLR=optimizer.param_groups[0]['lr']))
    # Class-loss meters do not exist without semantic_reg: return None
    # placeholders instead of raising NameError as the original did.
    return cos_losses.val, None, None
def test(model, device, test_loader):
    """Evaluate the three-headed tamper-detection model.

    Each image contributes several patches; per-image label/prediction sums
    are aggregated by image path, then confusion counts, accuracy,
    precision, and recall are printed and appended to accuracy.txt.
    """
    model.eval()
    test_loss = 0
    result = {}
    cur = 0
    total = len(test_loader.dataset)

    with torch.no_grad():
        for data, target, img_path in test_loader:
            if cur >= total:
                break
            data = data.to(device)
            target = target.to(device, dtype=torch.int64)
            output = model(data)

            # Weighted sum of the three heads' NLL losses.
            loss1 = F.nll_loss(output[0], target)
            loss2 = F.nll_loss(output[1], target)
            loss3 = F.nll_loss(output[2], target)
            loss = loss1 + loss2 + 0.1 * loss3
            # FIX: accumulate a Python float, not a tensor — keeping the
            # tensor held device memory and graph references per batch.
            test_loss += loss.item()

            # Merge head outputs with fixed weights, then take the argmax
            # (index of the max log-probability) as the predicted class.
            output_merge = output[0] * 0.5 + output[1] * 0.4 + output[2] * 0.1
            pred = output_merge.argmax(dim=1, keepdim=True)
            pred = pred.view(pred.shape[0])  # pred.shape = [batch_size]

            # Accumulate per-image [label_sum, prediction_sum] keyed by path.
            for i in range(len(pred)):
                if img_path[i] not in result:
                    result[img_path[i]] = [0, 0]
                result[img_path[i]][0] += target[i].item()  # [label, res]
                result[img_path[i]][1] += pred[i].item()
            print(len(result))
            del loss1, loss2, loss3, loss
            # NOTE(review): assumes a fixed batch size of 30 — confirm
            # against the loader configuration.
            cur += 30

    def _confusion(res):
        """Count TP/TN/FP/FN; 'positive' means a nonzero prediction sum."""
        TP = TN = FP = FN = 0
        for value in res.values():
            if value[1] == 0:
                if value[0] == 0:
                    TN += 1
                else:
                    FN += 1
            else:
                if value[0] == 0:
                    FP += 1
                else:
                    TP += 1
        return TP, TN, FP, FN

    TP, TN, FP, FN = _confusion(result)
    accuracy = (TP + TN) / (TP + TN + FP + FN) if result else 0.0
    # FIX: guard zero denominators (e.g. no positive predictions at all).
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    print(TP, TN, FP, FN)
    print(accuracy, precision, recall)

    test_loss /= total
    print(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, TP + TN, total / 3, 300. * (TP + TN) / total))
    print('--- Accuracy: {} -- Precision: {} --- Reacll: {}'.format(
        accuracy, precision, recall))
    with open("accuracy.txt", "a+") as file:
        file.write(
            '--- Accuracy: {} -- Precision: {} --- Reacll: {}\n'.format(
                accuracy, precision, recall))
def train_epoch(self, train_loader, task_id):
    """Run one training epoch for `task_id`, replaying saliency memory first.

    For tasks after the first (when xai_memory is enabled), an extra pass
    over stored saliency maps trains the explainer branch before the usual
    supervised pass over `train_loader`.
    """
    self.adjust_learning_rate(self.epoch)
    self.model.train()

    if task_id > 0 and self.args.experiment.xai_memory:
        # Saliency-memory replay: align predicted explanations with the
        # stored ground-truth saliency maps from earlier tasks.
        for idx, (data, target, sal, tt, _) in enumerate(self.saliency_loaders):
            x = data.to(device=self.device, dtype=torch.float)
            s = sal.to(device=self.device, dtype=torch.float)
            explanations, self.model, _, _ = self.explainer(x, self.model, task_id)
            self.saliency_size = explanations.size()
            # To make predicted explanations (Bx7x7) same as ground truth ones (Bx1x7x7)
            sal_loss = self.sal_loss(explanations.view_as(s), s)
            sal_loss *= self.args.saliency.regularizer
            if self.args.wandb.log:
                wandb.log({"Saliency loss": sal_loss.item()})
            try:
                # HACK: forces the loss to require grad; when the flag cannot
                # be set (loss attached to a graph), the batch is skipped.
                sal_loss.requires_grad = True
            except:
                continue
            self.optimizer_explanations.zero_grad()
            sal_loss.backward(retain_graph=True)
            self.optimizer_explanations.step()

    # Loop batches of the current task.
    for batch_idx, (x, y, tt) in enumerate(train_loader):
        images = x.to(device=self.device, dtype=torch.float)
        targets = y.to(device=self.device, dtype=torch.long)
        tt = tt.to(device=self.device, dtype=torch.long)

        # Forward — the task token is passed only for multi-head architectures.
        if self.args.architecture.multi_head:
            output = self.model.forward(images, tt)
        else:
            output = self.model.forward(images)
        loss = self.criterion(output, targets)

        # L1 regularize
        if self.args.train.l1_reg:
            reg_loss = self.l1_regularizer()
            factor = self.args.train.l1_reg_factor
            loss += factor * reg_loss
        loss *= self.args.train.task_loss_reg

        # Backward
        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)

        # Apply step
        # torch.nn.utils.clip_grad_norm_(self.model.parameters(),self.clipgrad)
        self.optimizer.step()
def to_device(data, device):
    """Recursively move a tensor — or a list/tuple of tensors — to `device`."""
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    # Containers are rebuilt as lists, with each element moved recursively.
    return [to_device(item, device) for item in data]
def cluster_for_instance(dataloader, args):
    """Extract scene- and instance-level features per (category, gender).

    Uses a Places365 ResNet-18 for whole-image (scene) features and a CIFAR
    ResNet-110 for cropped-object (instance) features, capping each bucket
    at 500 vectors, then pickles the results to results/<folder>/4.pkl.
    """
    use_cuda = not args.ngpu and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Scene model: ResNet-18 trained on Places365. The classifier head is
    # kept aside (scene_classifier) and replaced with an identity so that
    # model(...) yields penultimate features.
    arch = 'resnet18'
    model_file = '%s_places365.pth.tar' % arch
    model = models.__dict__[arch](num_classes=365).to(device)
    checkpoint = torch.load(model_file,
                            map_location=lambda storage, loc: storage)
    state_dict = {
        str.replace(k, 'module.', ''): v
        for k, v in checkpoint['state_dict'].items()
    }
    model.load_state_dict(state_dict)
    model.eval()
    scene_classifier = model.fc
    model.fc = nn.Sequential()  # identity head: expose features, not logits

    categories = dataloader.dataset.categories
    # One [gender-1 list, gender-2 list] pair per category.
    scene_features = [[[], []] for i in range(len(categories))]
    instance_features = [[[], []] for i in range(len(categories))]
    scene_filepaths = [[[], []] for i in range(len(categories))]

    # Instance model: ResNet-110 trained on CIFAR, applied to object crops.
    model_file = 'cifar_resnet110.th'
    small_model = resnet110()
    checkpoint = torch.load(model_file,
                            map_location=lambda storage, loc: storage)
    state_dict = {
        str.replace(k, 'module.', ''): v
        for k, v in checkpoint['state_dict'].items()
    }
    small_model.load_state_dict(state_dict)
    small_model.to(device)
    small_model.eval()

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    for i, (data, target) in enumerate(tqdm(dataloader)):
        gender = target[1]
        anns = target[0]
        if len(gender) > 1:
            # BUG FIX: Tensor.to() is not in-place — the original called
            # "data.to(device)" without assigning the result, so the data
            # never actually moved to the device.
            data = data.to(device)
            data = normalize(data)
            big_data = F.interpolate(data.unsqueeze(0), size=224,
                                     mode='bilinear').to(device)
            this_features = model.forward(big_data)
            logit = scene_classifier.forward(this_features)
            h_x = F.softmax(logit, 1).data.squeeze()
            probs, idx = h_x.sort(0, True)
            pred = idx[0]  # top-1 scene class
            size = list(data.size())[1:]
            scene_added = []
            for ann in anns:
                index = categories.index(ann['label'])
                # bbox is normalized; scale to pixel coordinates.
                bbox = np.array([
                    ann['bbox'][0] * size[1], ann['bbox'][1] * size[1],
                    ann['bbox'][2] * size[0], ann['bbox'][3] * size[0]
                ]).astype(int)
                instance = data[:, bbox[2]:bbox[3], bbox[0]:bbox[1]]
                if 0 in list(instance.size()):
                    continue  # skip degenerate (empty) crops
                small_data = F.interpolate(instance.unsqueeze(0), size=32,
                                           mode='bilinear').to(device)
                this_small_features = small_model.features(small_data)
                # Scene features are recorded once per (image, category).
                if len(scene_features[index][gender[0] - 1]) < 500 \
                        and index not in scene_added:
                    scene_added.append(index)
                    scene_features[index][gender[0] - 1].extend(
                        this_features.data.cpu().numpy())
                    scene_filepaths[index][gender[0] - 1].append(
                        (target[3], pred))
                if len(instance_features[index][gender[0] - 1]) < 500:
                    instance_features[index][gender[0] - 1].extend(
                        this_small_features.data.cpu().numpy())

    stats = {}
    stats['instance'] = instance_features
    stats['scene'] = scene_features
    stats['scene_filepaths'] = scene_filepaths
    pickle.dump(stats, open("results/{}/4.pkl".format(args.folder), "wb"))
def train(args):
    """SageMaker training entry point.

    Sets up (optionally multi-host distributed) training, runs
    `args.epochs` epochs, evaluates after each epoch, and saves the model.
    """
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = (args.processor == 'gpu') or (args.num_gpus > 0)
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment. WORLD_SIZE/RANK must be
        # in the environment before init_process_group is called.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        os.environ['RANK'] = str(host_rank)
        dist.init_process_group(backend=args.backend,
                                rank=host_rank,
                                world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '
            .format(args.backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
                dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir,
                                          is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir,
                                        **kwargs)

    # TODO: assert the logs when we move to the SDK local mode
    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)))
    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)))

    model = Net()
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.")
        # establish host rank and set device on this node
        torch.cuda.set_device(host_rank)
        model.cuda(host_rank)
        # for multiprocessing distributed, the DDP constructor should always set
        # the single device scope. otherwise, DDP will use all available devices.
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[host_rank], output_device=host_rank)
    elif use_cuda:
        # single-machine multi-gpu case
        logger.debug("Single-machine multi-gpu: using DataParallel().cuda().")
        model = model.to(device)
        model = torch.nn.DataParallel(model).to(device)
    else:
        # single-machine or multi-machine cpu case
        logger.debug("Single-machine/multi-machine cpu: using DataParallel.")
        model = model.to(device)
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            if is_distributed and use_cuda:
                # multi-machine multi-gpu case - allow asynchrous GPU copies of the data
                data, target = data.cuda(non_blocking=True), target.cuda(
                    non_blocking=True)
            else:
                data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                logger.debug(
                    'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(data),
                        len(train_loader.sampler),
                        100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)

    save_model(model, args.model_dir)
    # Only the rank-0 host (or a non-distributed run) performs the check.
    if is_distributed and host_rank == 0 or not is_distributed:
        assert_can_track_sagemaker_experiments()
def eval_epoch(self, epoch):
    """Evaluate on the validation loader; append epoch metrics to eval.csv.

    Computes average loss, accuracy, and ROC-AUC over the whole validation
    set and prints a summary including the best AUC so far.
    """
    #model_pth = '%s/model_epoch_%04d.pth' % (osp.join(self.save_path, 'models'), epoch)
    #self.model.load_state_dict(torch.load(model_pth))
    self.model.eval()
    if not os.path.exists(self.csv_path):
        os.mkdir(self.csv_path)
    eval_csv = os.path.join(self.csv_path, 'eval.csv')
    pred_list, target_list, loss_list, pos_list = [], [], [], []

    # FIX: run inference under no_grad — evaluation must not build
    # autograd graphs (wasted memory; nothing is backpropagated here).
    with torch.no_grad():
        for batch_idx, item in enumerate(self.val_loader):
            if self.cfig['model_name'] in ['disrnn', 'trnn']:
                data, target, dist = item
                data, target, dist = (data.to(self.device),
                                      target.to(self.device),
                                      dist.to(self.device))
                if batch_idx == 0:
                    print(dist.shape)
            else:
                data, target, ID = item
                data, target = data.to(self.device), target.to(self.device)
            if self.cfig['model_name'][-3:] == 'rnn':
                # RNN variants expect (seq, batch, ...) ordering.
                data = data.permute([1, 0, 2, 3, 4])
            self.optim.zero_grad()
            if self.cfig['model_name'] in ['disrnn', 'trnn']:
                pred = self.model(data, dist)
            else:
                pred = self.model(data)
            # FIX: pass dim explicitly — implicit-dim softmax is deprecated
            # and ambiguous for multi-dimensional inputs.
            pred_prob = F.softmax(pred, dim=1)
            #loss = self.criterion(pred, target)
            loss = nn.CrossEntropyLoss()(pred, target)
            pred_cls = pred.data.max(1)[1]
            # Probability of the positive class drives the ROC curve.
            pos_list += pred_prob[:, 1].data.cpu().numpy().tolist()
            pred_list += pred_cls.data.cpu().numpy().tolist()
            target_list += target.data.cpu().numpy().tolist()
            loss_list.append(loss.data.cpu().numpy().tolist())

    accuracy = accuracy_score(target_list, pred_list)
    print(confusion_matrix(target_list, pred_list))
    fpr, tpr, threshold = metrics.roc_curve(target_list, pos_list)
    roc_auc = metrics.auc(fpr, tpr)

    # ---------------------- save to csv ---------------------- #
    # Create the CSV with its header columns on first use.
    if not os.path.exists(eval_csv):
        csv_info = ['epoch', 'loss', 'auc', 'accuracy']
        init_csv = pd.DataFrame()
        for key in csv_info:
            init_csv[key] = []
        init_csv.to_csv(eval_csv)

    # Append this epoch's row by rewriting the whole file.
    df = pd.read_csv(eval_csv)
    data = pd.DataFrame()
    tmp_epoch = df['epoch'].tolist()
    tmp_epoch.append(epoch)
    tmp_loss = df['loss'].tolist()
    tmp_loss.append(np.mean(loss_list))
    tmp_auc = df['auc'].tolist()
    tmp_auc.append(roc_auc)
    tmp_acc = df['accuracy'].tolist()
    tmp_acc.append(accuracy)
    data['epoch'], data['loss'], data['auc'], data['accuracy'] = (
        tmp_epoch, tmp_loss, tmp_auc, tmp_acc)
    data.to_csv(eval_csv)
    print('val accuracy: ', accuracy, 'val auc: ', roc_auc)
    print('max val auc at: ', max(tmp_auc), tmp_auc.index(max(tmp_auc)))
real_label = 1 fake_label = 0 # setup optimizer optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) for epoch in range(opt.niter): for i, data_map in enumerate(dataloader, 0): data = data_map['image'] encodings = data_map['encoding'] # (1) Update D network: maximize log(D(x, h)) + (log(1 - D(G(z, h)) + log(1 - D(z, h')))/2 # train with real image right caption netD.zero_grad() real_cpu = data.to(device) batch_size = real_cpu.size(0) label = torch.full((batch_size, ), real_label, device=device) output = netD(real_cpu, encodings) errD_real = criterion(output, label) errD_real.backward() D_x = output.mean().item() # real image wrong caption noise = torch.randn(batch_size, nz, 1, 1, device=device) encoded_noise = torch.randn(batch_size, 4800, device=device) label.fill_(fake_label) output = netD(real_cpu, encoded_noise) errD_real_h = criterion(output, label)
def predict_fn(input_data, model):
    """Run sentiment inference on a single raw review string.

    Returns 0 or 1, obtained by rounding the model's scalar output.
    """
    print('Inferring sentiment of input data.')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if model.word_dict is None:
        raise Exception('Model has not been loaded properly, no word_dict.')

    # Convert the raw review into a length-500 padded index sequence.
    words = review_to_words(input_data)
    data_X, data_len = convert_and_pad(model.word_dict, words)

    # The model expects each row shaped as 'len, review[500]'.
    packed = np.hstack((data_len, data_X)).reshape(1, -1)
    data = torch.from_numpy(packed).to(device)

    model.eval()  # inference mode

    with torch.no_grad():
        output = model.forward(data)

    # Move the result to the CPU before converting to numpy (required when
    # inference ran on the GPU), then round to a 0/1 label.
    result = int(np.round(output.cpu().numpy()))
    print(result)
    return result
def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_dir, save_file):
    """Train with per-epoch validation, checkpointing on improved val loss.

    Args:
        n_epochs: total number of epochs to train for.
        loaders: dict with 'Training' and 'Validation' DataLoaders.
        model: the PyTorch model to train.
        optimizer: optimizer updating `model`'s parameters.
        criterion: loss function.
        use_cuda: move batches to the GPU when True.
        save_dir: directory for the best-model checkpoint.
        save_file: checkpoint filename within `save_dir`.
    """
    acc_list = []
    # np.inf replaces np.Inf, which was removed in NumPy 2.0.
    valid_loss_min = np.inf
    total_step = len(loaders['Training'])

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0
        print('Started epoch')

        ###################
        # train the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(loaders['Training']):
            # FIX: only one device move is needed; the original additionally
            # called .to(device) with an out-of-scope global `device`.
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # Incremental running mean of the training loss.
            train_loss = train_loss + (1 / (batch_idx + 1)) * (loss.data - train_loss)

            # Track the accuracy of this batch.
            total = target.size(0)
            _, predicted = torch.max(output.data, 1)
            correct = (predicted == target).sum().item()
            acc_list.append(correct / total)

            if (batch_idx + 1) % 100 == 0:
                # FIX: `epoch` is already 1-based; printing epoch + 1 was
                # off by one (showed 2..n_epochs+1).
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
                    .format(epoch, n_epochs, batch_idx + 1, total_step,
                            loss.item(), (correct / total) * 100))

        ######################
        # validate the model #
        ######################
        model.eval()
        # FIX: no gradient graphs are needed during validation.
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(loaders['Validation']):
                if use_cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                loss = criterion(output, target)
                valid_loss = valid_loss + (1 / (batch_idx + 1)) * (loss.data - valid_loss)

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.
              format(epoch, train_loss, valid_loss))

        # Save the model whenever validation loss improves.
        if valid_loss <= valid_loss_min:
            print(
                'Saving model: {} \tNew Valid Loss: {:.6f} \tPrevious Valid Loss: {:.6f}'
                .format(epoch, valid_loss, valid_loss_min))
            torch.save(model.state_dict(), os.path.join(save_dir, save_file))
            valid_loss_min = valid_loss
def train(epoch):
    """Run one DCGAN training epoch over the global `dataloader`.

    Alternates a discriminator update (real + fake batches) and a
    generator update per batch, then logs averaged statistics to
    TensorBoard at the end of the epoch.
    """
    iters = 0
    # For each batch in the dataloader
    stats = adl.Accumulator()
    for i, data_map in enumerate(dataloader, 0):
        data = data_map[0]
        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        # Format batch
        real_cpu = data.to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size, ), real_label, device=device)
        # Forward pass real batch through D
        output = netD(real_cpu).view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D; detach so no gradients flow to G.
        output = netD(fake.detach()).view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        # Calculate the gradients for this batch. These accumulate onto the
        # real-batch gradients — there is deliberately no zero_grad between
        # the two backward calls.
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Add the gradients from the all-real and all-fake batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output = netD(fake).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())
        stats["g_loss_sum"] += errG.item()
        stats["d_loss_sum"] += errD.item()
        stats["norm"] += metrics._metrics_state().grad_params[0]
        stats["var"] += metrics._metrics_state().grad_params[1]
        stats["replicas"] += 1.0

    scheduleD.step()
    scheduleG.step()
    with stats.synchronized():
        with SummaryWriter(adaptdl.get_tensorboard_dir()) as writer:
            writer.add_scalar("Loss/G",
                              stats["g_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Loss/D",
                              stats["d_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Performance/GlobalBatchsize",
                              b_size * stats["replicas"], epoch)
            writer.add_scalar("Performance/Replicas", stats["replicas"],
                              epoch)
            # NOTE(review): "Stats/Variance" is fed from stats["norm"] and
            # "Stats/Norm" from stats["var"] — these look swapped relative
            # to the accumulation above; confirm which labeling is intended.
            writer.add_scalar("Stats/Variance",
                              stats["norm"] / stats["replicas"], epoch)
            writer.add_scalar("Stats/Norm",
                              stats["var"] / stats["replicas"], epoch)
def train(epoch, model, train_loader, device, optimizer, args, writer):
    """One epoch of storchastic VAE training with gradient-variance logging.

    Every `args.log_interval` batches, the gradient is re-estimated
    `args.variance_samples` times to log per-parameter gradient variance;
    for small latent spaces (< 3), bias/MSE against the exact `Expect`
    gradient are also reported. Returns the epoch's average cost.
    """
    model.train()
    train_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        storch.reset()
        # Denote the minibatch dimension as being independent
        data = storch.denote_independent(data.view(-1, 784), 0, "data")
        recon_batch, KLD, z = model(data)
        storch.add_cost(loss_function(recon_batch, data), "reconstruction")
        cost = backward()
        train_loss += cost.item()
        optimizer.step()
        cond_log = batch_idx % args.log_interval == 0
        if cond_log:
            step = 100.0 * batch_idx / len(train_loader)
            global_step = 100 * (epoch - 1) + step
            # Variance of expect method is 0 by definition.
            variances = {}
            if args.method != "expect" and args.variance_samples > 1:
                _consider_param = "probs"
                if args.latents < 3:
                    # Small latent space: compute the exact gradient once
                    # with the Expect method as the reference for bias/MSE.
                    old_method = model.sampling_method
                    model.sampling_method = Expect("z")
                    optimizer.zero_grad()
                    recon_batch, _, z = model(data)
                    storch.add_cost(loss_function(recon_batch, data),
                                    "reconstruction")
                    backward()
                    expect_grad = storch.reduce_plates(
                        z.grad[_consider_param]).detach_tensor()
                    optimizer.zero_grad()
                    model.sampling_method = old_method
                grads = {n: [] for n in z.grad}
                for i in range(args.variance_samples):
                    # Draw a fresh gradient estimate from scratch.
                    optimizer.zero_grad()
                    recon_batch, _, z = model(data)
                    storch.add_cost(loss_function(recon_batch, data),
                                    "reconstruction")
                    backward()
                    for param_name, grad in z.grad.items():
                        # Make sure to reduce the data dimension and detach, for memory reasons.
                        grads[param_name].append(
                            storch.reduce_plates(grad).detach_tensor())
                variances = {}
                for param_name, gradz in grads.items():
                    # Create a new independent dimension for the different gradient samples
                    grad_samples = storch.gather_samples(gradz, "variance")
                    # Compute the variance over this independent dimension
                    variances[param_name] = storch.variance(
                        grad_samples, "variance")._tensor
                    if param_name == _consider_param and args.latents < 3:
                        mean = storch.reduce_plates(grad_samples, "variance")
                        mse = storch.reduce_plates(
                            (grad_samples - expect_grad)**2).sum()
                        bias = (storch.reduce_plates(
                            (mean - expect_grad)**2)).sum()
                        print("mse", mse._tensor.item())
                        # Should approach 0 when increasing variance_samples for unbiased estimators.
                        print("bias", bias._tensor.item())
                        writer.add_scalar("train/probs_bias", bias._tensor,
                                          global_step)
                        writer.add_scalar("train/probs_mse", mse._tensor,
                                          global_step)
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tCost: {:.6f}\t Logits var {}"
                .format(
                    epoch,
                    batch_idx * len(data),
                    len(train_loader.dataset),
                    step,
                    cost.item(),
                    variances,
                ))
            writer.add_scalar("train/ELBO", cost, global_step)
            for param_name, var in variances.items():
                writer.add_scalar("train/variance/" + param_name, var,
                                  global_step)
    avg_train_loss = train_loss / (batch_idx + 1)
    print("====> Epoch: {} Average loss: {:.4f}".format(epoch,
                                                        avg_train_loss))
    return avg_train_loss
def train(args):
    """Train the MNIST ``Net``, optionally distributed across SageMaker hosts.

    Initialises torch.distributed when more than one host is configured,
    builds the train/test loaders, wraps the model in
    DistributedDataParallel / DataParallel, runs the training loop with
    wandb logging, evaluates after each epoch and saves the final model.
    """
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ["WORLD_SIZE"] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        os.environ["RANK"] = str(host_rank)
        dist.init_process_group(backend=args.backend,
                                rank=host_rank,
                                world_size=world_size)
        logger.info(
            "Initialized the distributed environment: '{}' backend on {} nodes. "
            .format(args.backend, dist.get_world_size()) +
            "Current host rank is {}. Number of gpus: {}".format(
                dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir,
                                          is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir,
                                        **kwargs)

    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler),
        len(train_loader.dataset),
        100.0 * len(train_loader.sampler) / len(train_loader.dataset),
    ))
    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler),
        len(test_loader.dataset),
        100.0 * len(test_loader.sampler) / len(test_loader.dataset),
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)
    wandb.watch(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        # FIX: a DistributedSampler only reshuffles when told the epoch
        # number; without set_epoch every epoch iterates the data in the
        # same order on every worker.
        if is_distributed and hasattr(train_loader.sampler, "set_epoch"):
            train_loader.sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            wandb.log({"training/loss": loss.item()})
            if batch_idx % args.log_interval == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.sampler),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    ))
        test(model, test_loader, device)
    save_model(model, args.model_dir)
def main(args):
    """Adversarially train an MNIST model.

    Loads the (possibly ensemble) source models named in ``args.adv_models``,
    generates FGS adversarial examples from each of them plus the model being
    trained on every batch, trains on those via the module-level ``train``,
    then reports the test-set error rate and saves the trained weights.
    """
    def get_model_type(model_name):
        # Map each checkpoint path to the architecture index load_model expects.
        model_type = {
            'models/modelA': 0, 'models/modelA_adv': 0, 'models/modelA_ens': 0,
            'models/modelB': 1, 'models/modelB_adv': 1, 'models/modelB_ens': 1,
            'models/modelC': 2, 'models/modelC_adv': 2, 'models/modelC_ens': 2,
            'models/modelD': 3, 'models/modelD_adv': 3, 'models/modelD_ens': 3,
        }
        if model_name not in model_type:
            raise ValueError('Unknown model: {}'.format(model_name))
        return model_type[model_name]

    torch.manual_seed(args.seed)
    device = torch.device('cuda' if args.cuda else 'cpu')
    '''
    Preprocess MNIST dataset
    '''
    kwargs = {'num_workers': 20, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../attack_mnist', train=True, download=True,
        transform=transforms.ToTensor()),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../attack_mnist', train=False, transform=transforms.ToTensor()),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    eps = args.eps

    # if src_models is not None, we train on adversarial examples that come
    # from multiple models
    # (comprehension replaces the index loop; the former local shadowed the
    #  builtin `type` — `type=` below is load_model's keyword name, unchanged)
    adv_model_names = args.adv_models
    adv_models = [
        load_model(name, type=get_model_type(name)).to(device)
        for name in adv_model_names
    ]

    model = model_mnist(type=args.type).to(device)
    optimizer = optim.Adam(model.parameters())

    # Train on MNIST model
    x_advs = [None] * (len(adv_models) + 1)
    for epoch in range(args.epochs):
        for batch_idx, (data, labels) in enumerate(train_loader):
            data, labels = data.to(device), labels.to(device)
            # One adversarial batch per source model, plus one from the
            # model currently being trained (last slot).
            for i, m in enumerate(adv_models + [model]):
                grad = gen_grad(data, m, labels, loss='training')
                x_advs[i] = symbolic_fgs(data, grad, eps=eps)
            train(epoch, batch_idx, model, data, labels, optimizer,
                  x_advs=x_advs)

    # Finally print the result
    correct = 0
    with torch.no_grad():
        for (data, labels) in test_loader:
            data, labels = data.to(device), labels.to(device)
            correct += test(model, data, labels)
    test_error = 100. - 100. * correct / len(test_loader.dataset)
    print('Test Set Error Rate: {:.2f}%'.format(test_error))
    torch.save(model.state_dict(), args.model + '.pkl')
def train(args):
    """SageMaker-style training entry point (hard-coded gloo backend).

    Sets up torch.distributed when several hosts are configured, builds the
    data loaders, wraps the model for the detected hardware topology, runs
    the training loop (with manual gradient averaging in the CPU-distributed
    case), then evaluates and saves the model.
    """
    world_size = len(args.hosts)
    is_distributed = world_size > 1
    logger.debug('Number of hosts {}. Distributed training - {}'.format(world_size, is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug('Number of gpus available - {}'.format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device('cuda' if use_cuda else 'cpu')
    if is_distributed:
        # Initialize the distributed environment.
        backend = 'gloo'
        os.environ['WORLD_SIZE'] = str(world_size)
        # Rank is derived from this host's position in the host list.
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format(
            dist.get_rank(), torch.cuda.is_available(), args.num_gpus))
    # set the seed for generating random numbers
    seed = 1
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)
    train_sampler, train_loader = _get_train_data_loader(args.data_dir, is_distributed, args.batch_size, **kwargs)
    test_loader = _get_test_data_loader(args.data_dir, **kwargs)
    logger.debug('Processes {}/{} ({:.0f}%) of train data'.format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))
    logger.debug('Processes {}/{} ({:.0f}%) of test data'.format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))
    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug('Multi-machine multi-gpu: using DistributedDataParallel.')
        model = torch.nn.parallel.DistributedDataParallel(model)
    elif use_cuda:
        # single-machine multi-gpu case
        logger.debug('Single-machine multi-gpu: using DataParallel().cuda().')
        model = torch.nn.DataParallel(model)
    else:
        # single-machine or multi-machine cpu case
        logger.debug('Single-machine/multi-machine cpu: using DataParallel.')
        model = torch.nn.DataParallel(model)
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.5)
    log_interval = 100
    for epoch in range(1, args.epochs + 1):
        if is_distributed:
            # Re-seed the distributed sampler so each epoch shuffles differently.
            train_sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % log_interval == 0:
                logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
    accuracy = test(model, test_loader, device)
    save_model(model, args.model_dir)
    logger.debug('Overall test accuracy: {}'.format(accuracy))
def train_epoch(self, epoch):
    """Run one training epoch over ``self.train_loader`` and append the
    epoch's mean loss, accuracy and ROC-AUC to ``<csv_path>/train.csv``.

    Batches are ``(data, target, dist)`` for the distance-aware models
    ('disrnn'/'trnn') and ``(data, target, ID)`` otherwise.
    """
    self.model.train()
    # makedirs(..., exist_ok=True) avoids the race between the exists()
    # check and creation, and also handles nested csv paths.
    if not os.path.exists(self.csv_path):
        os.makedirs(self.csv_path, exist_ok=True)
    train_csv = os.path.join(self.csv_path, 'train.csv')
    pred_list, target_list, loss_list, pos_list = [], [], [], []
    print('epoch: ', epoch)
    for batch_idx, item in enumerate(self.train_loader):
        if self.cfig['model_name'] in ['disrnn', 'trnn']:
            data, target, dist = item
            data, target, dist = data.to(self.device), target.to(
                self.device), dist.to(self.device)
        else:
            data, target, ID = item
            data, target = data.to(self.device), target.to(self.device)
        if self.cfig['model_name'][-3:] == 'rnn':
            # RNN models consume time-major input — assumes batches are
            # (batch, seq, C, H, W); TODO confirm against the dataset.
            data = data.permute([1, 0, 2, 3, 4])
        self.optim.zero_grad()
        if self.cfig['model_name'] in ['disrnn', 'trnn']:
            pred = self.model(data, dist)
        else:
            pred = self.model(data)
        # dim=1 is the class dimension (pred_prob[:, 1] is used below as
        # the positive-class score); the implicit-dim form is deprecated.
        pred_prob = F.softmax(pred, dim=1)
        if batch_idx == 0:
            print('data.shape', data.shape)
            print('pred.shape', pred.shape)
            print('Epoch: ', epoch)
        loss = nn.CrossEntropyLoss()(pred, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 4)
        self.optim.step()
        # (removed dead print_str: it was never printed and its loss.data[0]
        # raises IndexError on 0-dim tensors in PyTorch >= 0.5)
        pred_cls = pred.data.max(1)[1]
        pos_list += pred_prob[:, 1].data.cpu().numpy().tolist()
        pred_list += pred_cls.data.cpu().numpy().tolist()
        target_list += target.data.cpu().numpy().tolist()
        loss_list.append(loss.item())
    print(confusion_matrix(target_list, pred_list))
    accuracy = accuracy_score(target_list, pred_list)
    fpr, tpr, _ = metrics.roc_curve(target_list, pos_list)
    roc_auc = metrics.auc(fpr, tpr)
    # -------------------------save to csv -----------------------#
    if not os.path.exists(train_csv):
        # First epoch: create an empty csv with the expected columns.
        csv_info = ['epoch', 'loss', 'auc', 'accuracy']
        init_csv = pd.DataFrame()
        for key in csv_info:
            init_csv[key] = []
        init_csv.to_csv(train_csv, index=False)
    df = pd.read_csv(train_csv)
    data = pd.DataFrame()
    tmp_epoch = df['epoch'].tolist()
    tmp_epoch.append(epoch)
    tmp_auc = df['auc'].tolist()
    tmp_auc.append(roc_auc)
    tmp_loss = df['loss'].tolist()
    tmp_loss.append(np.mean(loss_list))
    tmp_acc = df['accuracy'].tolist()
    tmp_acc.append(accuracy)
    data['epoch'], data['loss'], data['auc'], data[
        'accuracy'] = tmp_epoch, tmp_loss, tmp_auc, tmp_acc
    print('train accuracy: ', accuracy, 'train auc: ', roc_auc)
    # index=False keeps repeated read/append cycles from accumulating
    # 'Unnamed: 0' index columns in the csv.
    data.to_csv(train_csv, index=False)
def eval_unpadded_loss(data, target, model, vocab, device, target_scale):
    """Evaluate one batch without tracking gradients.

    Moves ``data``/``target`` to ``device``, runs the model and returns
    ``(output, loss)`` where the loss comes from the module-level
    ``loss_function``.
    """
    data, target = data.to(device), target.to(device)
    # The autograd.Variable wrappers were removed: they have been no-ops
    # since PyTorch 0.4 and are pointless inside torch.no_grad() anyway.
    with torch.no_grad():
        output = model(data, vocab, device)
        return output, loss_function(output, target, target_scale)
def main():
    """Train an adversarial-autoencoder-style GAN on CelebA.

    Four networks are trained jointly: an encoder E, a generator/decoder G,
    an image discriminator D (hinge loss) and a latent discriminator Dl
    (BCE, pushing E's codes toward the Gaussian prior). Periodically saves
    sample/reconstruction images and checkpoints to ``save_dir``.
    """
    args, save_dir = parse_arguments()
    ngpu = args.ngpu
    z_dim = args.z_dim
    batchSize = args.batch_size
    imageSize = args.image_size
    nepoch = args.nepoch
    data_dir = args.data_dir
    outf_img = check_folder(os.path.join(save_dir, 'img'))
    beta1 = 0.0
    cuda = True
    cudnn.benchmark = True
    device = torch.device("cuda:0" if cuda else "cpu")
    ## set seed
    manualSeed = random.randint(1, 10000)
    print("Random Seed: ", manualSeed)
    random.seed(manualSeed)
    torch.manual_seed(manualSeed)
    ### load data
    dataset = CelebaDataseat(data_dir=data_dir, resolution=imageSize)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batchSize,
                                             shuffle=True,
                                             num_workers=int(10),
                                             drop_last=True)
    dataloader_iterator = iter(dataloader)
    print('Size of the training set: ', len(dataset))
    ### set up models
    netE = Encoder(z_dim=z_dim).to(device)
    netE.apply(weights_init)
    netG = Generator(z_dim=z_dim).to(device)
    netDl = DiscriminatorL(z_dim=z_dim, ngpu=ngpu).to(
        device)  # discriminator(on the latent variable)
    netD = Discriminator().to(device)  # discriminator(on the image)
    ### define the losses
    criterion = nn.BCELoss()
    real_label = 1
    fake_label = 0
    fixed_noise = torch.randn(batchSize, z_dim, 1, 1, device=device)
    ### setup optimizer
    optimizerD = optim.Adam(netD.parameters(), lr=0.0004, betas=(beta1, 0.9))
    optimizerDl = optim.Adam(netDl.parameters(), lr=0.0002, betas=(beta1, 0.9))
    optimizerG = optim.Adam(netG.parameters(), lr=0.0001, betas=(beta1, 0.9))
    optimizerE = optim.Adam(netE.parameters(), lr=0.0001, betas=(beta1, 0.9))
    ### load previous trained model
    if_load, counter, checkpoint = load(save_dir)
    if if_load:
        netG.load_state_dict(checkpoint['netG_state_dict'])
        netE.load_state_dict(checkpoint['netE_state_dict'])
        netD.load_state_dict(checkpoint['netD_state_dict'])
        netDl.load_state_dict(checkpoint['netDl_state_dict'])
        optimizerG.load_state_dict(checkpoint['optG_state_dict'])
        optimizerE.load_state_dict(checkpoint['optE_state_dict'])
        optimizerD.load_state_dict(checkpoint['optD_state_dict'])
        optimizerDl.load_state_dict(checkpoint['optDl_state_dict'])
    for epoch in range(counter, nepoch):
        # NOTE(review): 5000 // batchSize iterations per "epoch" — presumably
        # an approximation of the dataset size; confirm against the dataset.
        for i in range(5000 // (batchSize)):
            ############################
            # (1) Update Dl network: maximize log(D(x)) + log(1 - D(G(z)))
            ###########################
            for _ in range(1):
                netDl.zero_grad()
                # train with real
                try:
                    data = next(dataloader_iterator)
                except StopIteration:
                    dataloader_iterator = iter(dataloader)
                    data = next(dataloader_iterator)
                real_ = data.to(device)
                # Encoded latents are labelled "fake" and prior noise "real":
                # Dl pushes the encoder's codes toward the Gaussian prior.
                label = torch.full((batchSize, ), fake_label, device=device)
                output = netE(real_)
                output = netDl(output)
                errDl_real = criterion(output, label)
                errDl_real.backward()
                errDl = errDl_real
                # train with fake
                noise = torch.randn(batchSize, z_dim, device=device)
                label = torch.full((batchSize, ), real_label, device=device)
                output = netDl(noise)
                errDl_real = criterion(output, label)
                errDl_real.backward()
                errDl += errDl_real
                optimizerDl.step()
            ############################
            # (2) Update D network: Hinge loss
            ###########################
            # Two D steps per Dl/G step.
            for _ in range(2):
                netD.zero_grad()
                # train with real
                try:
                    data = next(dataloader_iterator)
                except StopIteration:
                    dataloader_iterator = iter(dataloader)
                    data = next(dataloader_iterator)
                real_ = data.to(device)
                out_real = netD(real_)
                noise = torch.randn(batchSize, z_dim, 1, 1, device=device)
                fake = netG(noise)
                out_fake = netD(fake.detach())
                errD_real = (nn.ReLU()(0.5 + out_real)).mean()
                errD_real.backward()
                errD_fake = (nn.ReLU()(0.5 - out_fake)).mean()
                errD_fake.backward()
                errD = errD_real + errD_fake
                optimizerD.step()
            ############################
            # (3) Update G & E network: maximize log(D(G(z)))
            ###########################
            for _ in range(1):
                netG.zero_grad()
                netE.zero_grad()
                try:
                    data = next(dataloader_iterator)
                except StopIteration:
                    dataloader_iterator = iter(dataloader)
                    data = next(dataloader_iterator)
                real_ = data.to(device)
                # NOTE(review): unsqueeze+repeat with all-1 factors then view
                # is effectively a reshape to (batchSize, 3, H, W).
                real_ = real_.unsqueeze(1).repeat(1, 1, 1, 1, 1)
                real_ = real_.view(batchSize * 1, 3, imageSize, imageSize)
                encoded = netE(real_)
                fake_noise = encoded
                encoded = encoded.view(batchSize * 1, z_dim, 1, 1)
                rec_fake = netG(encoded)
                output = netD(rec_fake)
                outputN = netDl(fake_noise)
                label = torch.full((batchSize * 1, ), real_label,
                                   device=device)
                # criterionG is defined elsewhere in this file/project.
                errG = criterionG(output, label, real_, rec_fake, outputN,
                                  batchSize)
                errG.backward()
                optimizerG.step()
                optimizerE.step()
            if i % 100 == 0:
                print(
                    '[%d/%d][%d] Loss_D: %.4f, Loss_Dfake: %.4f, Loss_Dreal: %.4f, Loss_Dl: %.4f, Loss_G: %.4f'
                    % (epoch, nepoch, i, errD.item(), errD_fake.item(),
                       errD_real.item(), errDl.item(), errG.item()))
        # Periodic image dumps of prior samples, real images and reconstructions.
        if epoch % 20 == 0:
            noise = fixed_noise
            fake = netG(noise)
            fake = fake.view(batchSize, 3, imageSize, imageSize)
            vutils.save_image(fake.detach(),
                              '%s/epoch_%04d.png' % (outf_img, epoch),
                              normalize=True)
            vutils.save_image(real_.detach(),
                              '%s/real_%04d.png' % (outf_img, epoch),
                              normalize=True)
            vutils.save_image(rec_fake.detach(),
                              '%s/reconst_%04d.png' % (outf_img, epoch),
                              normalize=True)
        # Periodic checkpointing of all networks and optimizers.
        if epoch % 10 == 0:
            save_dict = {
                'steps': epoch,
                'netE_state_dict': netE.state_dict(),
                'netG_state_dict': netG.state_dict(),
                'netD_state_dict': netD.state_dict(),
                'netDl_state_dict': netDl.state_dict(),
                'optD_state_dict': optimizerD.state_dict(),
                'optDl_state_dict': optimizerDl.state_dict(),
                'optG_state_dict': optimizerG.state_dict(),
                'optE_state_dict': optimizerE.state_dict()
            }
            torch.save(save_dict, os.path.join(save_dir, 'checkpoint.pkl'))
            torch.save(netE, os.path.join(save_dir, 'netE.pt'))
            torch.save(netG, os.path.join(save_dir, 'netG.pt'))
def train(args):
    """Train the MNIST ``Net``, optionally distributed across SageMaker hosts.

    Initialises torch.distributed when several hosts and a backend are
    configured, builds the loaders, wraps the model in
    DistributedDataParallel / DataParallel, trains for ``args.epochs``
    epochs, evaluates after each epoch and saves the final model.
    """
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.backend,
                                rank=host_rank,
                                world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
                dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir,
                                          is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir,
                                        **kwargs)

    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))
    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        # FIX: a DistributedSampler only reshuffles when told the epoch
        # number; without set_epoch every epoch iterates the data in the
        # same order on every worker (the gloo variant of this function
        # elsewhere in the file already does this).
        if is_distributed and hasattr(train_loader.sampler, "set_epoch"):
            train_loader.sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                logger.info(
                    'Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                        epoch, batch_idx * len(data),
                        len(train_loader.sampler),
                        100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)
    save_model(model, args.model_dir)
def _load_regression_pairs(src_loc, tgt_loc, vocab, limit):
    """Load (token-id sequence, float target) pairs from parallel files.

    A positive ``limit`` stops after that many samples; zero or a negative
    value loads the whole file (the counter is decremented past zero and
    never reaches the break condition, preserving the original semantics).
    """
    pairs = []
    with open(src_loc, 'r') as src_file, open(tgt_loc, 'r') as tgt_file:
        for src_line, tgt_line in zip(src_file, tgt_file):
            sent = prepare_sequence(src_line.strip().split(), vocab)
            pairs.append((sent, float(tgt_line)))
            limit -= 1
            if limit == 0:
                break
    return pairs


def _mean_valid_loss(valid_loader, model, vocab, device, target_scale):
    """Average the per-batch loss from eval_unpadded_loss over the loader."""
    total_loss = 0
    batches = 0
    for data, target in valid_loader:
        _, loss = eval_unpadded_loss(data, target, model, vocab, device,
                                     target_scale)
        total_loss += loss
        batches += 1
    return total_loss / batches


def train(opt, vocab):
    """Train the LSTM ratio-regression model configured by ``opt``.

    Loads the training and validation src/tgt file pairs, optionally resumes
    from ``opt.train_from``, reports the validation MSE before training and
    after every epoch, and checkpoints every ``opt.save_checkpoint_epochs``
    epochs. (Deprecated autograd.Variable wrappers were removed — no-ops
    since PyTorch 0.4 — and the duplicated loading/eval loops were factored
    into the helpers above.)
    """
    # record starting time of the program
    start_time = time.time()
    torch.manual_seed(1)

    EMBEDDING_DIM = opt.embedding_dim
    HIDDEN_DIM = opt.hidden_dim
    BATCH_SIZE = opt.batch_size
    EPOCHS = opt.epochs
    device = opt.device
    train_from = opt.train_from
    LOAD_MODEL = len(train_from) > 0
    save_model_loc = opt.save_model
    save_checkpoint_epochs = opt.save_checkpoint_epochs
    target_scale = 1

    if opt.train_data_limit > 0:
        print('Limited the training data to first {} samples'.format(
            opt.train_data_limit))
    else:
        print('Using the full training dataset.')
    print('Loading the training dataset...')
    training_data = _load_regression_pairs(opt.train_src, opt.train_tgt,
                                           vocab, opt.train_data_limit)
    print('Successfully loaded the training dataset.')
    print("EMBEDDING_DIM = {}\nHIDDEN_DIM = {}\nBATCH_SIZE = {}\nEPOCHS = {}".
          format(EMBEDDING_DIM, HIDDEN_DIM, BATCH_SIZE, EPOCHS))

    # Train the model:
    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, vocab, device).to(device)
    if LOAD_MODEL:
        model.load_state_dict(torch.load(train_from))
        print('Resuming the model from {0}'.format(train_from))
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=1.0e-6)

    print("Model's state_dict:")
    for param_tensor in model.state_dict():
        print("\t", param_tensor, "\t",
              model.state_dict()[param_tensor].size())
    # Print optimizer's state_dict
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print("\t", var_name, "\t", optimizer.state_dict()[var_name])

    if opt.valid_data_limit > 0:
        print('Limited the test data to first {} samples'.format(
            opt.valid_data_limit))
    else:
        print('Using the full validation dataset.')
    print('Loading the validation dataset...')
    valid_data = _load_regression_pairs(opt.valid_src, opt.valid_tgt, vocab,
                                        opt.valid_data_limit)
    print('Successfully loaded the validation dataset.')

    my_collator = MyCollator(vocab)
    # collate also does the normalization of lengths
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True,
                                               num_workers=8,
                                               collate_fn=my_collator)

    # calculate total MSE loss on the test dataset before training the model
    initial_total_loss = _mean_valid_loss(valid_loader, model, vocab, device,
                                          target_scale)
    print('Total MSE loss before training: {}'.format(initial_total_loss))

    for epoch in range(EPOCHS):
        print("Starting epoch {}/{}...".format(epoch + 1, EPOCHS))
        # collate also does the normalization of lengths
        train_loader = torch.utils.data.DataLoader(training_data,
                                                   batch_size=BATCH_SIZE,
                                                   shuffle=True,
                                                   num_workers=8,
                                                   collate_fn=my_collator)
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            # Pytorch accumulates gradients; clear them before each batch.
            model.zero_grad()
            output = model(data, vocab, device)
            loss = loss_function(output, target, target_scale)
            loss.backward()
            optimizer.step()

        # validation MSE after training on this epoch
        total_loss = _mean_valid_loss(valid_loader, model, vocab, device,
                                      target_scale)
        print('Total MSE loss after training on epoch {}: {}'.format(
            epoch + 1, total_loss))

        if (epoch + 1) % save_checkpoint_epochs == 0:
            save_model_on_epoch_n = save_model_loc + '_epoch_' + repr(
                epoch + 1) + '.pt'
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'total_loss': total_loss,
                    'opt': opt
                }, save_model_on_epoch_n)
            print('Saved the model to {0}'.format(save_model_on_epoch_n))
    print('Training completed!')
    # show the time consumed by the program
    print("Total run time: {}".format(str(time.time() - start_time)))
img_list = [] G_losses = [] D_losses = [] iters = 0 for epoch in range(num_epochs): for i, data in enumerate(train_loader): ############################ # (1) Update D network ########################### net_d.zero_grad() # create training data real_img = data.to(device) b_size = real_img.size(0) real_label = torch.full((b_size, ), real_idx, device=device, dtype=torch.float) noise = torch.randn(b_size, nz, 1, 1, device=device, dtype=torch.float) fake_img = net_g(noise) fake_label = torch.full((b_size, ), fake_idx,
lr=lr, betas=(beta1, 0.999), weight_decay=1e-5) # 識別器D用 optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999), weight_decay=1e-5) # 生成器G用 fixed_noise = torch.randn(1, 1, batch_size, 128, device=device) # 確認用の固定したノイズ # In[7]: # 学習のループ for epoch in range(n_epoch): for itr, data in enumerate(dataloader): real_image = data.to(device) # 元画像 sample_size = real_image.size(0) # 画像枚数 #バッチサイズ×128のノイズを一つ noise = torch.randn(1, 1, sample_size, 128, device=device) # 正規分布からノイズを生成 real_target = torch.full((sample_size, ), 1., device=device) # 元画像に対する識別信号の目標値「1」 fake_target = torch.full((sample_size, ), 0., device=device) # 贋作画像に対する識別信号の目標値「0」 ############################ # 識別器Dの更新 ########################### netD.zero_grad() # 勾配の初期化
discriminator = Discriminator(args.classes).to(device) discriminator.apply(weights_init) if args.discriminator != None: discriminator.load_state_dict(torch.load(args.discriminator, map_location=device_name), strict=False) loss = nn.BCELoss() optimizerG = optim.Adam(generator.parameters(), lr=args.learning_rate) optimizerD = optim.Adam(discriminator.parameters(), lr=args.learning_rate) labels = torch.LongTensor([args.class_label]).repeat(args.batch_size).to(device) fixed_noise = generator.build_input(args.batch_size, labels) for epoch in range(1, args.epochs + 1): for i, (data, class_labels) in enumerate(dataloader, 0): real_data = data.to(device) batch_size = real_data.size(0) discriminator.zero_grad() input_data = discriminator.build_input(real_data, class_labels) output = discriminator(input_data) label = torch.full((batch_size,), 1, device=device) loss_with_real = loss(output, label) loss_with_real.backward() D_x = output.mean().item() generator_input = generator.build_input(args.batch_size, class_labels) fake_data = generator(generator_input) fake_data = discriminator.build_input(fake_data, class_labels)
model.fc = nn.Linear(2048, 16) model.aux_logits = False model = model.to(device) print('params to update:') params_to_update = [] for name, param in model.named_parameters(): if param.requires_grad == True: params_to_update.append(param) print('\t', name) optimizer = torch.optim.Adam(params_to_update, lr=opt.lr, betas=(opt.beta1, opt.beta2), eps=opt.eps) print('| - training...') for epoch in range(1, opt.epochs + 1): model.train() for idx, (data, target) in enumerate(train_loader): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = F.cross_entropy(output, target) loss.backward() optimizer.step() if idx % opt.log_freq: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, idx * len(data), len(train_loader.dataset), 100. * idx / len(train_loader), loss.item())) model.eval() test_loss, correct = 0, 0 batch_size = test_loader.batch_size error, i = [], 0 with torch.no_grad(): for data, target in test_loader: